From ee2f195dd3e40f49698ca4dc2666ec09c770e80d Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 30 May 2017 17:37:31 +0000
Subject: [PATCH 1/2] Vendor import of llvm trunk r304222:
 https://llvm.org/svn/llvm-project/llvm/trunk@304222

---
 docs/Proposals/VectorizationPlan.rst          |  182 +
 docs/Vectorizers.rst                          |   11 +
 docs/index.rst                                |    3 +
 include/llvm/Analysis/ScalarEvolution.h       |    3 +-
 include/llvm/CodeGen/DIE.h                    |    4 +-
 include/llvm/DebugInfo/CodeView/CodeView.h    |   22 +-
 ...mFragment.h => DebugChecksumsSubsection.h} |   37 +-
 .../CodeView/DebugFrameDataSubsection.h       |   59 +
 ...agment.h => DebugInlineeLinesSubsection.h} |   37 +-
 ...gLineFragment.h => DebugLinesSubsection.h} |   32 +-
 ...ngTable.h => DebugStringTableSubsection.h} |   39 +-
 .../llvm/DebugInfo/CodeView/DebugSubsection.h |   52 +
 ...agmentRecord.h => DebugSubsectionRecord.h} |   39 +-
 ...mentVisitor.h => DebugSubsectionVisitor.h} |   34 +-
 .../CodeView/DebugSymbolsSubsection.h         |   53 +
 ...ownFragment.h => DebugUnknownSubsection.h} |   11 +-
 .../DebugInfo/CodeView/ModuleDebugFragment.h  |   48 -
 .../CodeView/SymbolVisitorDelegate.h          |    4 +-
 .../PDB/Native/DbiModuleDescriptorBuilder.h   |   23 +-
 include/llvm/DebugInfo/PDB/Native/DbiStream.h |    4 +-
 .../DebugInfo/PDB/Native/ModuleDebugStream.h  |    7 +-
 .../DebugInfo/PDB/Native/PDBStringTable.h     |    4 +-
 .../PDB/Native/PDBStringTableBuilder.h        |   10 +-
 include/llvm/MC/ConstantPools.h               |    3 +-
 include/llvm/Support/ManagedStatic.h          |   20 +-
 include/llvm/TableGen/Record.h                |   18 +-
 .../llvm/Transforms/Scalar/GVNExpression.h    |   11 +-
 lib/Analysis/ScalarEvolution.cpp              |    7 +-
 lib/CodeGen/AsmPrinter/CodeViewDebug.cpp      |   17 +-
 lib/CodeGen/AsmPrinter/CodeViewDebug.h        |    2 +-
 lib/CodeGen/GlobalISel/Localizer.cpp          |    2 +-
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp      |    3 +-
 lib/CodeGen/SelectionDAG/LegalizeDAG.cpp      |    5 -
 lib/CodeGen/TargetLoweringBase.cpp            |    1 +
 lib/DebugInfo/CodeView/CMakeLists.txt         |   16 +-
 ...gment.cpp => DebugChecksumsSubsection.cpp} |   33 +-
 .../CodeView/DebugFrameDataSubsection.cpp     |   44 +
 ...nt.cpp => DebugInlineeLinesSubsection.cpp} |   38 +-
 ...eFragment.cpp => DebugLinesSubsection.cpp} |   52 +-
 ...ble.cpp => DebugStringTableSubsection.cpp} |   27 +-
 ...eDebugFragment.cpp => DebugSubsection.cpp} |    8 +-
 .../CodeView/DebugSubsectionRecord.cpp        |   81 +
 .../CodeView/DebugSubsectionVisitor.cpp       |   52 +
 .../CodeView/DebugSymbolsSubsection.cpp       |   34 +
 lib/DebugInfo/CodeView/EnumTables.cpp         |   28 +-
 .../CodeView/ModuleDebugFragmentRecord.cpp    |   84 -
 .../CodeView/ModuleDebugFragmentVisitor.cpp   |   52 -
 lib/DebugInfo/CodeView/SymbolDumper.cpp       |    6 +-
 .../PDB/Native/DbiModuleDescriptorBuilder.cpp |   18 +-
 lib/MC/MCCodeView.cpp                         |    6 +-
 lib/Support/APFloat.cpp                       |    2 +
 lib/Support/Timer.cpp                         |   13 +-
 lib/TableGen/Record.cpp                       |   28 +-
 lib/Target/AArch64/AArch64FrameLowering.cpp   |   35 +-
 lib/Target/AArch64/AArch64ISelLowering.cpp    |    3 -
 lib/Target/AMDGPU/AMDGPUSubtarget.h           |    2 +-
 lib/Target/AMDGPU/AMDGPUTargetMachine.cpp     |    3 +
 lib/Target/AMDGPU/SIFoldOperands.cpp          |    7 +-
 lib/Target/AMDGPU/SIPeepholeSDWA.cpp          |   49 +-
 lib/Target/ARM/ARMISelLowering.cpp            |    4 -
 lib/Target/Hexagon/HexagonISelLowering.cpp    |    2 +-
 lib/Target/Mips/AsmParser/MipsAsmParser.cpp   |  387 +-
 lib/Target/Mips/MipsISelLowering.cpp          |    1 -
 lib/Target/Mips/MipsInstrFPU.td               |   23 +
 lib/Target/Mips/MipsRegisterInfo.td           |   27 +
 lib/Target/PowerPC/PPCISelLowering.cpp        |    3 -
 lib/Target/SystemZ/SystemZ.td                 |    2 +
 lib/Target/SystemZ/SystemZFeatures.td         |   14 +-
 lib/Target/SystemZ/SystemZISelLowering.cpp    |   60 +-
 lib/Target/SystemZ/SystemZInstrDFP.td         |  231 ++
 lib/Target/SystemZ/SystemZInstrFP.td          |   19 +-
 lib/Target/SystemZ/SystemZInstrFormats.td     |   97 +-
 lib/Target/SystemZ/SystemZInstrHFP.td         |  240 ++
 lib/Target/SystemZ/SystemZScheduleZ13.td      |  232 ++
 lib/Target/SystemZ/SystemZScheduleZ196.td     |  219 ++
 lib/Target/SystemZ/SystemZScheduleZEC12.td    |  225 ++
 lib/Target/SystemZ/SystemZSubtarget.cpp       |    2 +
 lib/Target/SystemZ/SystemZSubtarget.h         |    8 +
 .../WebAssembly/WebAssemblyISelLowering.cpp   |    4 +-
 lib/Target/X86/X86ISelLowering.cpp            |   81 +-
 lib/Transforms/Scalar/NewGVN.cpp              |   55 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp    |   21 +-
 test/CodeGen/AArch64/reg-scavenge-frame.mir   |   52 +
 test/CodeGen/AMDGPU/add.v2i16.ll              |   30 +-
 test/CodeGen/AMDGPU/bfe-combine.ll            |   10 +-
 test/CodeGen/AMDGPU/commute-compares.ll       |    2 +-
 test/CodeGen/AMDGPU/commute_modifiers.ll      |    2 +-
 test/CodeGen/AMDGPU/copy-illegal-type.ll      |    2 +-
 test/CodeGen/AMDGPU/cvt_f32_ubyte.ll          |    1 -
 test/CodeGen/AMDGPU/fabs.f64.ll               |    4 +-
 test/CodeGen/AMDGPU/fabs.ll                   |    4 +-
 test/CodeGen/AMDGPU/fadd.f16.ll               |   18 +-
 test/CodeGen/AMDGPU/fadd64.ll                 |    2 +-
 test/CodeGen/AMDGPU/fcanonicalize.f16.ll      |   21 +-
 test/CodeGen/AMDGPU/fmul.f16.ll               |   18 +-
 test/CodeGen/AMDGPU/fneg-fabs.f16.ll          |   17 +-
 test/CodeGen/AMDGPU/fneg-fabs.f64.ll          |    4 +-
 test/CodeGen/AMDGPU/fneg-fabs.ll              |    4 +-
 test/CodeGen/AMDGPU/fneg.f16.ll               |    8 +-
 test/CodeGen/AMDGPU/fract.f64.ll              |    6 +-
 test/CodeGen/AMDGPU/fsub.f16.ll               |   18 +-
 test/CodeGen/AMDGPU/fsub64.ll                 |    2 +-
 test/CodeGen/AMDGPU/immv216.ll                |   57 +-
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |   32 +-
 .../AMDGPU/llvm.amdgcn.div.fixup.f16.ll       |    6 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll   |    2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll       |    4 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll |    2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll  |    3 +-
 test/CodeGen/AMDGPU/llvm.fma.f16.ll           |    6 +-
 test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll       |    4 +-
 test/CodeGen/AMDGPU/llvm.maxnum.f16.ll        |   18 +-
 test/CodeGen/AMDGPU/llvm.minnum.f16.ll        |   17 +-
 test/CodeGen/AMDGPU/mad24-get-global-id.ll    |    2 +-
 test/CodeGen/AMDGPU/madak.ll                  |    4 +-
 test/CodeGen/AMDGPU/madmk.ll                  |    4 +-
 test/CodeGen/AMDGPU/mul.ll                    |    8 +-
 test/CodeGen/AMDGPU/scratch-simple.ll         |    6 +-
 test/CodeGen/AMDGPU/sdiv.ll                   |    2 +-
 test/CodeGen/AMDGPU/sdwa-peephole.ll          |    5 +-
 test/CodeGen/AMDGPU/sdwa-scalar-ops.mir       |  410 +++
 test/CodeGen/AMDGPU/select.f16.ll             |   12 +-
 test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll    |   21 +-
 test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll     |   43 +-
 test/CodeGen/AMDGPU/sminmax.v2i16.ll          |    5 +-
 test/CodeGen/AMDGPU/srem.ll                   |    2 +-
 test/CodeGen/AMDGPU/sub.v2i16.ll              |   30 +-
 test/CodeGen/AMDGPU/udiv.ll                   |    6 +-
 test/CodeGen/AMDGPU/urem.ll                   |    2 +-
 .../CodeGen/AMDGPU/use-sgpr-multiple-times.ll |   12 +-
 test/CodeGen/AMDGPU/v_mac_f16.ll              |   11 +-
 test/CodeGen/AMDGPU/wqm.ll                    |    2 +-
 test/CodeGen/WebAssembly/negative-base-reg.ll |    2 +-
 test/CodeGen/X86/bitcast-and-setcc-128.ll     | 1155 ++++++
 test/CodeGen/X86/bitcast-and-setcc-256.ll     |  403 +++
 test/CodeGen/X86/mul-constant-i16.ll          |  141 +-
 test/CodeGen/X86/mul-constant-i32.ll          | 1589 ++++++++-
 test/CodeGen/X86/mul-constant-i64.ll          | 1612 ++++++++-
 test/CodeGen/X86/setcc-lowering.ll            |   65 +-
 test/CodeGen/X86/vector-sext.ll               |   56 +
 test/CodeGen/X86/xchg-nofold.ll               |   37 +
 test/MC/AArch64/ldr-pseudo.s                  |   12 +
 test/MC/Disassembler/SystemZ/insns-z13.txt    |  108 +
 test/MC/Disassembler/SystemZ/insns.txt        | 3178 ++++++++++++++++-
 test/MC/Mips/macro-li.d.s                     |  443 +++
 test/MC/Mips/macro-li.s.s                     |  198 +
 test/MC/SystemZ/insn-bad-z13.s                |  146 +
 test/MC/SystemZ/insn-bad-z196.s               |  368 ++
 test/MC/SystemZ/insn-bad-zEC12.s              |  166 +
 test/MC/SystemZ/insn-bad.s                    |  998 ++++++
 test/MC/SystemZ/insn-good-z13.s               |   80 +
 test/MC/SystemZ/insn-good-z196.s              |  336 ++
 test/MC/SystemZ/insn-good-zEC12.s             |   80 +
 test/MC/SystemZ/insn-good.s                   | 2135 +++++++++++
 .../AArch64/no_vector_instructions.ll         |   26 -
 .../llvm-pdbdump/C13DebugFragmentVisitor.cpp  |   14 +-
 tools/llvm-pdbdump/C13DebugFragmentVisitor.h  |   26 +-
 tools/llvm-pdbdump/LLVMOutputStyle.cpp        |   14 +-
 tools/llvm-pdbdump/YAMLOutputStyle.cpp        |   16 +-
 tools/llvm-pdbdump/llvm-pdbdump.cpp           |   13 +-
 tools/llvm-readobj/COFFDumper.cpp             |  113 +-
 tools/llvm-readobj/CodeView.h                 |   54 -
 .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp    |   12 +-
 unittests/Support/ManagedStatic.cpp           |   41 +
 utils/TableGen/GlobalISelEmitter.cpp          |    2 +-
 utils/TableGen/X86FoldTablesEmitter.cpp       |    9 +-
 166 files changed, 16655 insertions(+), 1441 deletions(-)
 create mode 100644 docs/Proposals/VectorizationPlan.rst
 rename include/llvm/DebugInfo/CodeView/{ModuleDebugFileChecksumFragment.h => DebugChecksumsSubsection.h} (63%)
 create mode 100644 include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h
 rename include/llvm/DebugInfo/CodeView/{ModuleDebugInlineeLinesFragment.h => DebugInlineeLinesSubsection.h} (67%)
 rename include/llvm/DebugInfo/CodeView/{ModuleDebugLineFragment.h => DebugLinesSubsection.h} (81%)
 rename include/llvm/DebugInfo/CodeView/{StringTable.h => DebugStringTableSubsection.h} (55%)
 create mode 100644 include/llvm/DebugInfo/CodeView/DebugSubsection.h
 rename include/llvm/DebugInfo/CodeView/{ModuleDebugFragmentRecord.h => DebugSubsectionRecord.h} (55%)
 rename include/llvm/DebugInfo/CodeView/{ModuleDebugFragmentVisitor.h => DebugSubsectionVisitor.h} (50%)
 create mode 100644 include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h
 rename include/llvm/DebugInfo/CodeView/{ModuleDebugUnknownFragment.h => DebugUnknownSubsection.h} (60%)
 delete mode 100644 include/llvm/DebugInfo/CodeView/ModuleDebugFragment.h
 rename lib/DebugInfo/CodeView/{ModuleDebugFileChecksumFragment.cpp => DebugChecksumsSubsection.cpp} (72%)
 create mode 100644 lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp
 rename lib/DebugInfo/CodeView/{ModuleDebugInlineeLinesFragment.cpp => DebugInlineeLinesSubsection.cpp} (66%)
 rename lib/DebugInfo/CodeView/{ModuleDebugLineFragment.cpp => DebugLinesSubsection.cpp} (69%)
 rename lib/DebugInfo/CodeView/{StringTable.cpp => DebugStringTableSubsection.cpp} (61%)
 rename lib/DebugInfo/CodeView/{ModuleDebugFragment.cpp => DebugSubsection.cpp} (55%)
 create mode 100644 lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
 create mode 100644 lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp
 create mode 100644 lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp
 delete mode 100644 lib/DebugInfo/CodeView/ModuleDebugFragmentRecord.cpp
 delete mode 100644 lib/DebugInfo/CodeView/ModuleDebugFragmentVisitor.cpp
 create mode 100644 lib/Target/SystemZ/SystemZInstrDFP.td
 create mode 100644 lib/Target/SystemZ/SystemZInstrHFP.td
 create mode 100644 test/CodeGen/AArch64/reg-scavenge-frame.mir
 create mode 100644 test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
 create mode 100644 test/CodeGen/X86/bitcast-and-setcc-128.ll
 create mode 100644 test/CodeGen/X86/bitcast-and-setcc-256.ll
 create mode 100644 test/CodeGen/X86/xchg-nofold.ll
 create mode 100644 test/MC/Mips/macro-li.d.s
 create mode 100644 test/MC/Mips/macro-li.s.s
 delete mode 100644 test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
 delete mode 100644 tools/llvm-readobj/CodeView.h

diff --git a/docs/Proposals/VectorizationPlan.rst b/docs/Proposals/VectorizationPlan.rst
new file mode 100644
index 000000000000..82ce4b2de17a
--- /dev/null
+++ b/docs/Proposals/VectorizationPlan.rst
@@ -0,0 +1,182 @@
+==================
+Vectorization Plan
+==================
+
+.. contents::
+   :local:
+
+Abstract
+========
+The vectorization transformation can be rather complicated, involving several
+potential alternatives, especially for outer-loops [1]_ but also possibly for
+innermost loops. These alternatives may have significant performance impact,
+both positive and negative. A cost model is therefore employed to identify the
+best alternative, including the alternative of avoiding any transformation
+altogether.
+
+The Vectorization Plan is an explicit model for describing vectorization
+candidates. It serves for both optimizing candidates including estimating their
+cost reliably, and for performing their final translation into IR. This
+facilitates dealing with multiple vectorization candidates.
+
+High-level Design
+=================
+
+Vectorization Workflow
+----------------------
+VPlan-based vectorization involves three major steps, taking a "scenario-based
+approach" to vectorization planning:
+
+1. Legal Step: check if a loop can be legally vectorized; encode contraints and
+   artifacts if so.
+2. Plan Step:
+
+   a. Build initial VPlans following the constraints and decisions taken by
+      Legal Step 1, and compute their cost.
+   b. Apply optimizations to the VPlans, possibly forking additional VPlans.
+      Prune sub-optimal VPlans having relatively high cost.
+3. Execute Step: materialize the best VPlan. Note that this is the only step
+   that modifies the IR.
+
+Design Guidelines
+-----------------
+In what follows, the term "input IR" refers to code that is fed into the
+vectorizer whereas the term "output IR" refers to code that is generated by the
+vectorizer. The output IR contains code that has been vectorized or "widened"
+according to a loop Vectorization Factor (VF), and/or loop unroll-and-jammed
+according to an Unroll Factor (UF).
+The design of VPlan follows several high-level guidelines:
+
+1. Analysis-like: building and manipulating VPlans must not modify the input IR.
+   In particular, if the best option is not to vectorize at all, the
+   vectorization process terminates before reaching Step 3, and compilation
+   should proceed as if VPlans had not been built.
+
+2. Align Cost & Execute: each VPlan must support both estimating the cost and
+   generating the output IR code, such that the cost estimation evaluates the
+   to-be-generated code reliably.
+
+3. Support vectorizing additional constructs:
+
+   a. Outer-loop vectorization. In particular, VPlan must be able to model the
+      control-flow of the output IR which may include multiple basic-blocks and
+      nested loops.
+   b. SLP vectorization.
+   c. Combinations of the above, including nested vectorization: vectorizing
+      both an inner loop and an outer-loop at the same time (each with its own
+      VF and UF), mixed vectorization: vectorizing a loop with SLP patterns
+      inside [4]_, (re)vectorizing input IR containing vector code.
+   d. Function vectorization [2]_.
+
+4. Support multiple candidates efficiently. In particular, similar candidates
+   related to a range of possible VF's and UF's must be represented efficiently.
+   Potential versioning needs to be supported efficiently.
+
+5. Support vectorizing idioms, such as interleaved groups of strided loads or
+   stores. This is achieved by modeling a sequence of output instructions using
+   a "Recipe", which is responsible for computing its cost and generating its
+   code.
+
+6. Encapsulate Single-Entry Single-Exit regions (SESE). During vectorization
+   such regions may need to be, for example, predicated and linearized, or
+   replicated VF*UF times to handle scalarized and predicated instructions.
+   Innerloops are also modelled as SESE regions.
+
+Low-level Design
+================
+The low-level design of VPlan comprises of the following classes.
+
+:LoopVectorizationPlanner:
+  A LoopVectorizationPlanner is designed to handle the vectorization of a loop
+  or a loop nest. It can construct, optimize and discard one or more VPlans,
+  each VPlan modelling a distinct way to vectorize the loop or the loop nest.
+  Once the best VPlan is determined, including the best VF and UF, this VPlan
+  drives the generation of output IR.
+
+:VPlan:
+  A model of a vectorized candidate for a given input IR loop or loop nest. This
+  candidate is represented using a Hierarchical CFG. VPlan supports estimating
+  the cost and driving the generation of the output IR code it represents.
+
+:Hierarchical CFG:
+  A control-flow graph whose nodes are basic-blocks or Hierarchical CFG's. The
+  Hierarchical CFG data structure is similar to the Tile Tree [5]_, where
+  cross-Tile edges are lifted to connect Tiles instead of the original
+  basic-blocks as in Sharir [6]_, promoting the Tile encapsulation. The terms
+  Region and Block are used rather than Tile [5]_ to avoid confusion with loop
+  tiling.
+
+:VPBlockBase:
+  The building block of the Hierarchical CFG. A pure-virtual base-class of
+  VPBasicBlock and VPRegionBlock, see below. VPBlockBase models the hierarchical
+  control-flow relations with other VPBlocks. Note that in contrast to the IR
+  BasicBlock, a VPBlockBase models its control-flow successors and predecessors
+  directly, rather than through a Terminator branch or through predecessor
+  branches that "use" the VPBlockBase.
+
+:VPBasicBlock:
+  VPBasicBlock is a subclass of VPBlockBase, and serves as the leaves of the
+  Hierarchical CFG. It represents a sequence of output IR instructions that will
+  appear consecutively in an output IR basic-block. The instructions of this
+  basic-block originate from one or more VPBasicBlocks. VPBasicBlock holds a
+  sequence of zero or more VPRecipes that model the cost and generation of the
+  output IR instructions.
+
+:VPRegionBlock:
+  VPRegionBlock is a subclass of VPBlockBase. It models a collection of
+  VPBasicBlocks and VPRegionBlocks which form a SESE subgraph of the output IR
+  CFG. A VPRegionBlock may indicate that its contents are to be replicated a
+  constant number of times when output IR is generated, effectively representing
+  a loop with constant trip-count that will be completely unrolled. This is used
+  to support scalarized and predicated instructions with a single model for
+  multiple candidate VF's and UF's.
+
+:VPRecipeBase:
+  A pure-virtual base class modeling a sequence of one or more output IR
+  instructions, possibly based on one or more input IR instructions. These
+  input IR instructions are referred to as "Ingredients" of the Recipe. A Recipe
+  may specify how its ingredients are to be transformed to produce the output IR
+  instructions; e.g., cloned once, replicated multiple times or widened
+  according to selected VF.
+
+:VPTransformState:
+  Stores information used for generating output IR, passed from
+  LoopVectorizationPlanner to its selected VPlan for execution, and used to pass
+  additional information down to VPBlocks and VPRecipes.
+
+Related LLVM components
+-----------------------
+1. SLP Vectorizer: one can compare the VPlan model with LLVM's existing SLP
+   tree, where TSLP [3]_ adds Plan Step 2.b.
+
+2. RegionInfo: one can compare VPlan's H-CFG with the Region Analysis as used by
+   Polly [7]_.
+
+References
+----------
+.. [1] "Outer-loop vectorization: revisited for short SIMD architectures", Dorit
+    Nuzman and Ayal Zaks, PACT 2008.
+
+.. [2] "Proposal for function vectorization and loop vectorization with function
+    calls", Xinmin Tian, [`cfe-dev
+    <http://lists.llvm.org/pipermail/cfe-dev/2016-March/047732.html>`_].,
+    March 2, 2016.
+    See also `review <https://reviews.llvm.org/D22792>`_.
+
+.. [3] "Throttling Automatic Vectorization: When Less is More", Vasileios
+    Porpodas and Tim Jones, PACT 2015 and LLVM Developers' Meeting 2015.
+
+.. [4] "Exploiting mixed SIMD parallelism by reducing data reorganization
+    overhead", Hao Zhou and Jingling Xue, CGO 2016.
+
+.. [5] "Register Allocation via Hierarchical Graph Coloring", David Callahan and
+    Brian Koblenz, PLDI 1991
+
+.. [6] "Structural analysis: A new approach to flow analysis in optimizing
+    compilers", M. Sharir, Journal of Computer Languages, Jan. 1980
+
+.. [7] "Enabling Polyhedral Optimizations in LLVM", Tobias Grosser, Diploma
+    thesis, 2011.
+
+.. [8] "Introducing VPlan to the Loop Vectorizer", Gil Rapaport and Ayal Zaks,
+    European LLVM Developers' Meeting 2017.
diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst
index a909d458c317..317271af4032 100644
--- a/docs/Vectorizers.rst
+++ b/docs/Vectorizers.rst
@@ -382,6 +382,17 @@ And Linpack-pc with the same configuration. Result is Mflops, higher is better.
 
 .. image:: linpack-pc.png
 
+Ongoing Development Directions
+------------------------------
+
+.. toctree::
+   :hidden:
+
+   Proposals/VectorizationPlan
+
+:doc:`Proposals/VectorizationPlan`
+   Modeling the process and upgrading the infrastructure of LLVM's Loop Vectorizer.
+
 .. _slp-vectorizer:
 
 The SLP Vectorizer
diff --git a/docs/index.rst b/docs/index.rst
index becbe48e7ec7..220df1566bd5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -528,6 +528,7 @@ can be better.
 
    CodeOfConduct
    Proposals/GitHubMove
+   Proposals/VectorizationPlan
 
 :doc:`CodeOfConduct`
    Proposal to adopt a code of conduct on the LLVM social spaces (lists, events,
@@ -536,6 +537,8 @@ can be better.
 :doc:`Proposals/GitHubMove`
    Proposal to move from SVN/Git to GitHub.
 
+:doc:`Proposals/VectorizationPlan`
+   Proposal to model the process and upgrade the infrastructure of LLVM's Loop Vectorizer.
 
 Indices and tables
 ==================
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 4a6fc245c225..1d715b590ab7 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -1536,8 +1536,7 @@ class ScalarEvolution {
   /// Determine if the SCEV can be evaluated at loop's entry. It is true if it
   /// doesn't depend on a SCEVUnknown of an instruction which is dominated by
   /// the header of loop L.
-  bool isAvailableAtLoopEntry(const SCEV *S, const Loop *L, DominatorTree &DT,
-                              LoopInfo &LI);
+  bool isAvailableAtLoopEntry(const SCEV *S, const Loop *L);
 
   /// Return true if the given SCEV changes value in a known way in the
   /// specified loop.  This property being true implies that the value is
diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h
index 4be44e62fa92..4f47ba6e3852 100644
--- a/include/llvm/CodeGen/DIE.h
+++ b/include/llvm/CodeGen/DIE.h
@@ -383,11 +383,11 @@ class DIEValue {
       return;
 #define HANDLE_DIEVALUE_SMALL(T)                                               \
   case is##T:                                                                  \
-    destruct<DIE##T>();
+    destruct<DIE##T>();                                                        \
     return;
 #define HANDLE_DIEVALUE_LARGE(T)                                               \
   case is##T:                                                                  \
-    destruct<const DIE##T *>();
+    destruct<const DIE##T *>();                                                \
     return;
 #include "llvm/CodeGen/DIEValue.def"
     }
diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h
index f881ad0c9d80..3316e71916ed 100644
--- a/include/llvm/DebugInfo/CodeView/CodeView.h
+++ b/include/llvm/DebugInfo/CodeView/CodeView.h
@@ -13,6 +13,8 @@
 #include <cinttypes>
 #include <type_traits>
 
+#include "llvm/Support/Endian.h"
+
 namespace llvm {
 namespace codeview {
 
@@ -291,7 +293,7 @@ enum class ModifierOptions : uint16_t {
 };
 CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(ModifierOptions)
 
-enum class ModuleDebugFragmentKind : uint32_t {
+enum class DebugSubsectionKind : uint32_t {
   None = 0,
   Symbols = 0xf1,
   Lines = 0xf2,
@@ -550,6 +552,24 @@ enum LineFlags : uint16_t {
   LF_None = 0,
   LF_HaveColumns = 1, // CV_LINES_HAVE_COLUMNS
 };
+
+/// Data in the the SUBSEC_FRAMEDATA subection.
+struct FrameData {
+  support::ulittle32_t RvaStart;
+  support::ulittle32_t CodeSize;
+  support::ulittle32_t LocalSize;
+  support::ulittle32_t ParamsSize;
+  support::ulittle32_t MaxStackSize;
+  support::ulittle32_t FrameFunc;
+  support::ulittle16_t PrologSize;
+  support::ulittle16_t SavedRegsSize;
+  support::ulittle32_t Flags;
+  enum : uint32_t {
+    HasSEH = 1 << 0,
+    HasEH = 1 << 1,
+    IsFunctionStart = 1 << 2,
+  };
+};
 }
 }
 
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h b/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h
similarity index 63%
rename from include/llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h
rename to include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h
index 6c08c9aa2137..e7036033d2d9 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h
+++ b/include/llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h
@@ -1,4 +1,4 @@
-//===- ModuleDebugFileChecksumFragment.h ------------------------*- C++ -*-===//
+//===- DebugChecksumsSubsection.h -------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFILECHECKSUMFRAGMENT_H
-#define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFILECHECKSUMFRAGMENT_H
+#ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGCHECKSUMSSUBSECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_DEBUGCHECKSUMSSUBSECTION_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/BinaryStreamReader.h"
@@ -21,7 +21,7 @@
 namespace llvm {
 namespace codeview {
 
-class StringTable;
+class DebugStringTableSubsection;
 
 struct FileChecksumEntry {
   uint32_t FileNameOffset;    // Byte offset of filename in global stringtable.
@@ -43,19 +43,22 @@ template <> struct VarStreamArrayExtractor<codeview::FileChecksumEntry> {
 
 namespace llvm {
 namespace codeview {
-class ModuleDebugFileChecksumFragmentRef final : public ModuleDebugFragmentRef {
+class DebugChecksumsSubsectionRef final : public DebugSubsectionRef {
   typedef VarStreamArray<codeview::FileChecksumEntry> FileChecksumArray;
   typedef FileChecksumArray::Iterator Iterator;
 
 public:
-  ModuleDebugFileChecksumFragmentRef()
-      : ModuleDebugFragmentRef(ModuleDebugFragmentKind::FileChecksums) {}
+  DebugChecksumsSubsectionRef()
+      : DebugSubsectionRef(DebugSubsectionKind::FileChecksums) {}
 
-  static bool classof(const ModuleDebugFragmentRef *S) {
-    return S->kind() == ModuleDebugFragmentKind::FileChecksums;
+  static bool classof(const DebugSubsectionRef *S) {
+    return S->kind() == DebugSubsectionKind::FileChecksums;
   }
 
+  bool valid() const { return Checksums.valid(); }
+
   Error initialize(BinaryStreamReader Reader);
+  Error initialize(BinaryStreamRef Stream);
 
   Iterator begin() { return Checksums.begin(); }
   Iterator end() { return Checksums.end(); }
@@ -66,23 +69,23 @@ class ModuleDebugFileChecksumFragmentRef final : public ModuleDebugFragmentRef {
   FileChecksumArray Checksums;
 };
 
-class ModuleDebugFileChecksumFragment final : public ModuleDebugFragment {
+class DebugChecksumsSubsection final : public DebugSubsection {
 public:
-  explicit ModuleDebugFileChecksumFragment(StringTable &Strings);
+  explicit DebugChecksumsSubsection(DebugStringTableSubsection &Strings);
 
-  static bool classof(const ModuleDebugFragment *S) {
-    return S->kind() == ModuleDebugFragmentKind::FileChecksums;
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::FileChecksums;
   }
 
   void addChecksum(StringRef FileName, FileChecksumKind Kind,
                    ArrayRef<uint8_t> Bytes);
 
-  uint32_t calculateSerializedLength() override;
-  Error commit(BinaryStreamWriter &Writer) override;
+  uint32_t calculateSerializedSize() const override;
+  Error commit(BinaryStreamWriter &Writer) const override;
   uint32_t mapChecksumOffset(StringRef FileName) const;
 
 private:
-  StringTable &Strings;
+  DebugStringTableSubsection &Strings;
 
   DenseMap<uint32_t, uint32_t> OffsetMap;
   uint32_t SerializedSize = 0;
diff --git a/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h b/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h
new file mode 100644
index 000000000000..686b5c4f242e
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h
@@ -0,0 +1,59 @@
+//===- DebugFrameDataSubsection.h ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGFRAMEDATASUBSECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_DEBUGFRAMEDATASUBSECTION_H
+
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace codeview {
+class DebugFrameDataSubsectionRef final : public DebugSubsectionRef {
+public:
+  DebugFrameDataSubsectionRef()
+      : DebugSubsectionRef(DebugSubsectionKind::FrameData) {}
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::FrameData;
+  }
+
+  Error initialize(BinaryStreamReader Reader);
+
+  FixedStreamArray<FrameData>::Iterator begin() const { return Frames.begin(); }
+  FixedStreamArray<FrameData>::Iterator end() const { return Frames.end(); }
+
+  const void *getRelocPtr() const { return RelocPtr; }
+
+private:
+  const uint32_t *RelocPtr = nullptr;
+  FixedStreamArray<FrameData> Frames;
+};
+
+class DebugFrameDataSubsection final : public DebugSubsection {
+public:
+  DebugFrameDataSubsection()
+      : DebugSubsection(DebugSubsectionKind::FrameData) {}
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::FrameData;
+  }
+
+  uint32_t calculateSerializedSize() const override;
+  Error commit(BinaryStreamWriter &Writer) const override;
+
+  void addFrameData(const FrameData &Frame);
+
+private:
+  std::vector<FrameData> Frames;
+};
+}
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h b/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h
similarity index 67%
rename from include/llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h
rename to include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h
index 348497cbf7f2..e2cfc3c99233 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h
+++ b/include/llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h
@@ -1,4 +1,4 @@
-//===- ModuleDebugInlineeLinesFragment.h ------------------------*- C++ -*-===//
+//===- DebugInlineeLinesSubsection.h ----------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGINLINEELINESFRAGMENT_H
-#define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGINLINEELINESFRAGMENT_H
+#ifndef LLVM_DEBUGINFO_CODEVIEW_BUGINLINEELINESSUBSECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_BUGINLINEELINESSUBSECTION_H
 
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
@@ -19,9 +19,8 @@
 namespace llvm {
 namespace codeview {
 
-class ModuleDebugInlineeLineFragmentRef;
-class ModuleDebugFileChecksumFragment;
-class StringTable;
+class DebugInlineeLinesSubsectionsRef;
+class DebugChecksumsSubsection;
 
 enum class InlineeLinesSignature : uint32_t {
   Normal,    // CV_INLINEE_SOURCE_LINE_SIGNATURE
@@ -51,15 +50,15 @@ template <> struct VarStreamArrayExtractor<codeview::InlineeSourceLine> {
 };
 
 namespace codeview {
-class ModuleDebugInlineeLineFragmentRef final : public ModuleDebugFragmentRef {
+class DebugInlineeLinesSubsectionRef final : public DebugSubsectionRef {
   typedef VarStreamArray<InlineeSourceLine> LinesArray;
   typedef LinesArray::Iterator Iterator;
 
 public:
-  ModuleDebugInlineeLineFragmentRef();
+  DebugInlineeLinesSubsectionRef();
 
-  static bool classof(const ModuleDebugFragmentRef *S) {
-    return S->kind() == ModuleDebugFragmentKind::InlineeLines;
+  static bool classof(const DebugSubsectionRef *S) {
+    return S->kind() == DebugSubsectionKind::InlineeLines;
   }
 
   Error initialize(BinaryStreamReader Reader);
@@ -73,23 +72,23 @@ class ModuleDebugInlineeLineFragmentRef final : public ModuleDebugFragmentRef {
   VarStreamArray<InlineeSourceLine> Lines;
 };
 
-class ModuleDebugInlineeLineFragment final : public ModuleDebugFragment {
+class DebugInlineeLinesSubsection final : public DebugSubsection {
 public:
-  ModuleDebugInlineeLineFragment(ModuleDebugFileChecksumFragment &Checksums,
-                                 bool HasExtraFiles);
+  DebugInlineeLinesSubsection(DebugChecksumsSubsection &Checksums,
+                              bool HasExtraFiles);
 
-  static bool classof(const ModuleDebugFragment *S) {
-    return S->kind() == ModuleDebugFragmentKind::InlineeLines;
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::InlineeLines;
   }
 
-  Error commit(BinaryStreamWriter &Writer) override;
-  uint32_t calculateSerializedLength() override;
+  Error commit(BinaryStreamWriter &Writer) const override;
+  uint32_t calculateSerializedSize() const override;
 
   void addInlineSite(TypeIndex FuncId, StringRef FileName, uint32_t SourceLine);
   void addExtraFile(StringRef FileName);
 
 private:
-  ModuleDebugFileChecksumFragment &Checksums;
+  DebugChecksumsSubsection &Checksums;
 
   bool HasExtraFiles = false;
   uint32_t ExtraFileCount = 0;
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h b/include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h
similarity index 81%
rename from include/llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h
rename to include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h
index 3124236b8fb1..1b63af59c2ed 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h
+++ b/include/llvm/DebugInfo/CodeView/DebugLinesSubsection.h
@@ -1,4 +1,4 @@
-//===- ModuleDebugLineFragment.h --------------------------------*- C++ -*-===//
+//===- DebugLinesSubsection.h --------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,8 +10,8 @@
 #ifndef LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGLINEFRAGMENT_H
 #define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGLINEFRAGMENT_H
 
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
@@ -19,8 +19,8 @@
 namespace llvm {
 namespace codeview {
 
-class ModuleDebugFileChecksumFragment;
-class StringTable;
+class DebugChecksumsSubsection;
+class DebugStringTableSubsection;
 
 // Corresponds to the `CV_DebugSLinesHeader_t` structure.
 struct LineFragmentHeader {
@@ -70,16 +70,16 @@ class LineColumnExtractor {
                        LineColumnEntry &Item, const LineFragmentHeader *Ctx);
 };
 
-class ModuleDebugLineFragmentRef final : public ModuleDebugFragmentRef {
+class DebugLinesSubsectionRef final : public DebugSubsectionRef {
   friend class LineColumnExtractor;
   typedef VarStreamArray<LineColumnEntry, LineColumnExtractor> LineInfoArray;
   typedef LineInfoArray::Iterator Iterator;
 
 public:
-  ModuleDebugLineFragmentRef();
+  DebugLinesSubsectionRef();
 
-  static bool classof(const ModuleDebugFragmentRef *S) {
-    return S->kind() == ModuleDebugFragmentKind::Lines;
+  static bool classof(const DebugSubsectionRef *S) {
+    return S->kind() == DebugSubsectionKind::Lines;
   }
 
   Error initialize(BinaryStreamReader Reader);
@@ -96,7 +96,7 @@ class ModuleDebugLineFragmentRef final : public ModuleDebugFragmentRef {
   LineInfoArray LinesAndColumns;
 };
 
-class ModuleDebugLineFragment final : public ModuleDebugFragment {
+class DebugLinesSubsection final : public DebugSubsection {
   struct Block {
     Block(uint32_t ChecksumBufferOffset)
         : ChecksumBufferOffset(ChecksumBufferOffset) {}
@@ -107,11 +107,11 @@ class ModuleDebugLineFragment final : public ModuleDebugFragment {
   };
 
 public:
-  ModuleDebugLineFragment(ModuleDebugFileChecksumFragment &Checksums,
-                          StringTable &Strings);
+  DebugLinesSubsection(DebugChecksumsSubsection &Checksums,
+                       DebugStringTableSubsection &Strings);
 
-  static bool classof(const ModuleDebugFragment *S) {
-    return S->kind() == ModuleDebugFragmentKind::Lines;
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::Lines;
   }
 
   void createBlock(StringRef FileName);
@@ -119,8 +119,8 @@ class ModuleDebugLineFragment final : public ModuleDebugFragment {
   void addLineAndColumnInfo(uint32_t Offset, const LineInfo &Line,
                             uint32_t ColStart, uint32_t ColEnd);
 
-  uint32_t calculateSerializedLength() override;
-  Error commit(BinaryStreamWriter &Writer) override;
+  uint32_t calculateSerializedSize() const override;
+  Error commit(BinaryStreamWriter &Writer) const override;
 
   void setRelocationAddress(uint16_t Segment, uint16_t Offset);
   void setCodeSize(uint32_t Size);
@@ -129,7 +129,7 @@ class ModuleDebugLineFragment final : public ModuleDebugFragment {
   bool hasColumnInfo() const;
 
 private:
-  ModuleDebugFileChecksumFragment &Checksums;
+  DebugChecksumsSubsection &Checksums;
 
   uint16_t RelocOffset = 0;
   uint16_t RelocSegment = 0;
diff --git a/include/llvm/DebugInfo/CodeView/StringTable.h b/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h
similarity index 55%
rename from include/llvm/DebugInfo/CodeView/StringTable.h
rename to include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h
index 05dc02ee849f..fbe39cb16f09 100644
--- a/include/llvm/DebugInfo/CodeView/StringTable.h
+++ b/include/llvm/DebugInfo/CodeView/DebugStringTableSubsection.h
@@ -1,4 +1,4 @@
-//===- StringTable.h - CodeView String Table Reader/Writer ------*- C++ -*-===//
+//===- DebugStringTableSubsection.h - CodeView String Table -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_CODEVIEW_STRINGTABLE_H
-#define LLVM_DEBUGINFO_CODEVIEW_STRINGTABLE_H
+#ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGSTRINGTABLESUBSECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_DEBUGSTRINGTABLESUBSECTION_H
 
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 
@@ -28,11 +28,15 @@ namespace codeview {
 
 /// Represents a read-only view of a CodeView string table.  This is a very
 /// simple flat buffer consisting of null-terminated strings, where strings
-/// are retrieved by their offset in the buffer.  StringTableRef does not own
-/// the underlying storage for the buffer.
-class StringTableRef {
+/// are retrieved by their offset in the buffer.  DebugStringTableSubsectionRef
+/// does not own the underlying storage for the buffer.
+class DebugStringTableSubsectionRef : public DebugSubsectionRef {
 public:
-  StringTableRef();
+  DebugStringTableSubsectionRef();
+
+  static bool classof(const DebugSubsectionRef *S) {
+    return S->kind() == DebugSubsectionKind::StringTable;
+  }
 
   Error initialize(BinaryStreamRef Contents);
 
@@ -44,11 +48,18 @@ class StringTableRef {
   BinaryStreamRef Stream;
 };
 
-/// Represents a read-write view of a CodeView string table.  StringTable owns
-/// the underlying storage for the table, and is capable of serializing the
-/// string table into a format understood by StringTableRef.
-class StringTable {
+/// Represents a read-write view of a CodeView string table.
+/// DebugStringTableSubsection owns the underlying storage for the table, and is
+/// capable of serializing the string table into a format understood by
+/// DebugStringTableSubsectionRef.
+class DebugStringTableSubsection : public DebugSubsection {
 public:
+  DebugStringTableSubsection();
+
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::StringTable;
+  }
+
   // If string S does not exist in the string table, insert it.
   // Returns the ID for S.
   uint32_t insert(StringRef S);
@@ -56,8 +67,8 @@ class StringTable {
   // Return the ID for string S.  Assumes S exists in the table.
   uint32_t getStringId(StringRef S) const;
 
-  uint32_t calculateSerializedSize() const;
-  Error commit(BinaryStreamWriter &Writer) const;
+  uint32_t calculateSerializedSize() const override;
+  Error commit(BinaryStreamWriter &Writer) const override;
 
   uint32_t size() const;
 
diff --git a/include/llvm/DebugInfo/CodeView/DebugSubsection.h b/include/llvm/DebugInfo/CodeView/DebugSubsection.h
new file mode 100644
index 000000000000..e427e0006a55
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/DebugSubsection.h
@@ -0,0 +1,52 @@
+//===- DebugSubsection.h ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENT_H
+#define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENT_H
+
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Casting.h"
+
+namespace llvm {
+namespace codeview {
+
+class DebugSubsectionRef {
+public:
+  explicit DebugSubsectionRef(DebugSubsectionKind Kind) : Kind(Kind) {}
+  virtual ~DebugSubsectionRef();
+
+  static bool classof(const DebugSubsectionRef *S) { return true; }
+
+  DebugSubsectionKind kind() const { return Kind; }
+
+protected:
+  DebugSubsectionKind Kind;
+};
+
+class DebugSubsection {
+public:
+  explicit DebugSubsection(DebugSubsectionKind Kind) : Kind(Kind) {}
+  virtual ~DebugSubsection();
+
+  static bool classof(const DebugSubsection *S) { return true; }
+
+  DebugSubsectionKind kind() const { return Kind; }
+
+  virtual Error commit(BinaryStreamWriter &Writer) const = 0;
+  virtual uint32_t calculateSerializedSize() const = 0;
+
+protected:
+  DebugSubsectionKind Kind;
+};
+
+} // namespace codeview
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENT_H
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h b/include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h
similarity index 55%
rename from include/llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h
rename to include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h
index f68f21b224f1..b2e1131e5968 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h
+++ b/include/llvm/DebugInfo/CodeView/DebugSubsectionRecord.h
@@ -1,4 +1,4 @@
-//===- ModuleDebugFragment.h ------------------------------------*- C++ -*-===//
+//===- DebugSubsection.h ------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -20,52 +20,49 @@
 namespace llvm {
 namespace codeview {
 
-class ModuleDebugFragment;
+class DebugSubsection;
 
 // Corresponds to the `CV_DebugSSubsectionHeader_t` structure.
-struct ModuleDebugFragmentHeader {
-  support::ulittle32_t Kind;   // codeview::ModuleDebugFragmentKind enum
+struct DebugSubsectionHeader {
+  support::ulittle32_t Kind;   // codeview::DebugSubsectionKind enum
   support::ulittle32_t Length; // number of bytes occupied by this record.
 };
 
-class ModuleDebugFragmentRecord {
+class DebugSubsectionRecord {
 public:
-  ModuleDebugFragmentRecord();
-  ModuleDebugFragmentRecord(ModuleDebugFragmentKind Kind, BinaryStreamRef Data);
+  DebugSubsectionRecord();
+  DebugSubsectionRecord(DebugSubsectionKind Kind, BinaryStreamRef Data);
 
-  static Error initialize(BinaryStreamRef Stream,
-                          ModuleDebugFragmentRecord &Info);
+  static Error initialize(BinaryStreamRef Stream, DebugSubsectionRecord &Info);
 
   uint32_t getRecordLength() const;
-  ModuleDebugFragmentKind kind() const;
+  DebugSubsectionKind kind() const;
   BinaryStreamRef getRecordData() const;
 
 private:
-  ModuleDebugFragmentKind Kind;
+  DebugSubsectionKind Kind;
   BinaryStreamRef Data;
 };
 
-class ModuleDebugFragmentRecordBuilder {
+class DebugSubsectionRecordBuilder {
 public:
-  ModuleDebugFragmentRecordBuilder(ModuleDebugFragmentKind Kind,
-                                   ModuleDebugFragment &Frag);
+  DebugSubsectionRecordBuilder(DebugSubsectionKind Kind, DebugSubsection &Frag);
   uint32_t calculateSerializedLength();
   Error commit(BinaryStreamWriter &Writer);
 
 private:
-  ModuleDebugFragmentKind Kind;
-  ModuleDebugFragment &Frag;
+  DebugSubsectionKind Kind;
+  DebugSubsection &Frag;
 };
 
 } // namespace codeview
 
-template <>
-struct VarStreamArrayExtractor<codeview::ModuleDebugFragmentRecord> {
+template <> struct VarStreamArrayExtractor<codeview::DebugSubsectionRecord> {
   typedef void ContextType;
 
   static Error extract(BinaryStreamRef Stream, uint32_t &Length,
-                       codeview::ModuleDebugFragmentRecord &Info) {
-    if (auto EC = codeview::ModuleDebugFragmentRecord::initialize(Stream, Info))
+                       codeview::DebugSubsectionRecord &Info) {
+    if (auto EC = codeview::DebugSubsectionRecord::initialize(Stream, Info))
       return EC;
     Length = Info.getRecordLength();
     return Error::success();
@@ -73,7 +70,7 @@ struct VarStreamArrayExtractor<codeview::ModuleDebugFragmentRecord> {
 };
 
 namespace codeview {
-typedef VarStreamArray<ModuleDebugFragmentRecord> ModuleDebugFragmentArray;
+typedef VarStreamArray<DebugSubsectionRecord> DebugSubsectionArray;
 }
 } // namespace llvm
 
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h b/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h
similarity index 50%
rename from include/llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h
rename to include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h
index 1f55d2024203..55bef491c97e 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h
@@ -1,4 +1,4 @@
-//===- ModuleDebugFragmentVisitor.h -----------------------------*- C++ -*-===//
+//===- DebugSubsectionVisitor.h -----------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,43 +17,41 @@ namespace llvm {
 
 namespace codeview {
 
-class ModuleDebugFileChecksumFragmentRef;
-class ModuleDebugFragmentRecord;
-class ModuleDebugInlineeLineFragmentRef;
-class ModuleDebugLineFragmentRef;
-class ModuleDebugUnknownFragmentRef;
+class DebugChecksumsSubsectionRef;
+class DebugSubsectionRecord;
+class DebugInlineeLinesSubsectionRef;
+class DebugLinesSubsectionRef;
+class DebugUnknownSubsectionRef;
 
-class ModuleDebugFragmentVisitor {
+class DebugSubsectionVisitor {
 public:
-  virtual ~ModuleDebugFragmentVisitor() = default;
+  virtual ~DebugSubsectionVisitor() = default;
 
-  virtual Error visitUnknown(ModuleDebugUnknownFragmentRef &Unknown) {
+  virtual Error visitUnknown(DebugUnknownSubsectionRef &Unknown) {
     return Error::success();
   }
-  virtual Error visitLines(ModuleDebugLineFragmentRef &Lines) {
+  virtual Error visitLines(DebugLinesSubsectionRef &Lines) {
     return Error::success();
   }
 
-  virtual Error
-  visitFileChecksums(ModuleDebugFileChecksumFragmentRef &Checksums) {
+  virtual Error visitFileChecksums(DebugChecksumsSubsectionRef &Checksums) {
     return Error::success();
   }
 
-  virtual Error visitInlineeLines(ModuleDebugInlineeLineFragmentRef &Inlinees) {
+  virtual Error visitInlineeLines(DebugInlineeLinesSubsectionRef &Inlinees) {
     return Error::success();
   }
 
   virtual Error finished() { return Error::success(); }
 };
 
-Error visitModuleDebugFragment(const ModuleDebugFragmentRecord &R,
-                               ModuleDebugFragmentVisitor &V);
+Error visitDebugSubsection(const DebugSubsectionRecord &R,
+                           DebugSubsectionVisitor &V);
 
 template <typename T>
-Error visitModuleDebugFragments(T &&FragmentRange,
-                                ModuleDebugFragmentVisitor &V) {
+Error visitDebugSubsections(T &&FragmentRange, DebugSubsectionVisitor &V) {
   for (const auto &L : FragmentRange) {
-    if (auto EC = visitModuleDebugFragment(L, V))
+    if (auto EC = visitDebugSubsection(L, V))
       return EC;
   }
   if (auto EC = V.finished())
diff --git a/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h b/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h
new file mode 100644
index 000000000000..3d1eb27ba270
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h
@@ -0,0 +1,53 @@
+//===- DebugSymbolsSubsection.h --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_DEBUGSYMBOLSSUBSECTION_H
+#define LLVM_DEBUGINFO_CODEVIEW_DEBUGSYMBOLSSUBSECTION_H
+
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace codeview {
+class DebugSymbolsSubsectionRef final : public DebugSubsectionRef {
+public:
+  DebugSymbolsSubsectionRef()
+      : DebugSubsectionRef(DebugSubsectionKind::Symbols) {}
+
+  static bool classof(const DebugSubsectionRef *S) {
+    return S->kind() == DebugSubsectionKind::Symbols;
+  }
+
+  Error initialize(BinaryStreamReader Reader);
+
+private:
+  CVSymbolArray Records;
+};
+
+class DebugSymbolsSubsection final : public DebugSubsection {
+public:
+  DebugSymbolsSubsection() : DebugSubsection(DebugSubsectionKind::Symbols) {}
+  static bool classof(const DebugSubsection *S) {
+    return S->kind() == DebugSubsectionKind::Symbols;
+  }
+
+  uint32_t calculateSerializedSize() const override;
+  Error commit(BinaryStreamWriter &Writer) const override;
+
+  void addSymbol(CVSymbol Symbol);
+
+private:
+  uint32_t Length = 0;
+  std::vector<CVSymbol> Records;
+};
+}
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h b/include/llvm/DebugInfo/CodeView/DebugUnknownSubsection.h
similarity index 60%
rename from include/llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h
rename to include/llvm/DebugInfo/CodeView/DebugUnknownSubsection.h
index b8c1c02e5cf1..ea9a96ca8d68 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h
+++ b/include/llvm/DebugInfo/CodeView/DebugUnknownSubsection.h
@@ -1,4 +1,4 @@
-//===- ModuleDebugUnknownFragment.h -----------------------------*- C++ -*-===//
+//===- DebugUnknownSubsection.h -----------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,17 +10,16 @@
 #ifndef LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGUNKNOWNFRAGMENT_H
 #define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGUNKNOWNFRAGMENT_H
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/Support/BinaryStreamRef.h"
 
 namespace llvm {
 namespace codeview {
 
-class ModuleDebugUnknownFragmentRef final : public ModuleDebugFragmentRef {
+class DebugUnknownSubsectionRef final : public DebugSubsectionRef {
 public:
-  ModuleDebugUnknownFragmentRef(ModuleDebugFragmentKind Kind,
-                                BinaryStreamRef Data)
-      : ModuleDebugFragmentRef(Kind), Data(Data) {}
+  DebugUnknownSubsectionRef(DebugSubsectionKind Kind, BinaryStreamRef Data)
+      : DebugSubsectionRef(Kind), Data(Data) {}
 
   BinaryStreamRef getData() const { return Data; }
 
diff --git a/include/llvm/DebugInfo/CodeView/ModuleDebugFragment.h b/include/llvm/DebugInfo/CodeView/ModuleDebugFragment.h
deleted file mode 100644
index a5311cae9480..000000000000
--- a/include/llvm/DebugInfo/CodeView/ModuleDebugFragment.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===- ModuleDebugFragment.h ------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENT_H
-#define LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENT_H
-
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/Support/BinaryStreamWriter.h"
-#include "llvm/Support/Casting.h"
-
-namespace llvm {
-namespace codeview {
-
-class ModuleDebugFragmentRef {
-public:
-  explicit ModuleDebugFragmentRef(ModuleDebugFragmentKind Kind) : Kind(Kind) {}
-  virtual ~ModuleDebugFragmentRef();
-
-  ModuleDebugFragmentKind kind() const { return Kind; }
-
-protected:
-  ModuleDebugFragmentKind Kind;
-};
-
-class ModuleDebugFragment {
-public:
-  explicit ModuleDebugFragment(ModuleDebugFragmentKind Kind) : Kind(Kind) {}
-  virtual ~ModuleDebugFragment();
-
-  ModuleDebugFragmentKind kind() const { return Kind; }
-
-  virtual Error commit(BinaryStreamWriter &Writer) = 0;
-  virtual uint32_t calculateSerializedLength() = 0;
-
-protected:
-  ModuleDebugFragmentKind Kind;
-};
-
-} // namespace codeview
-} // namespace llvm
-
-#endif // LLVM_DEBUGINFO_CODEVIEW_MODULEDEBUGFRAGMENT_H
diff --git a/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h b/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h
index 96c8a47a3669..a2a3c6f18fba 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h
@@ -19,7 +19,7 @@ class BinaryStreamReader;
 
 namespace codeview {
 
-class StringTableRef;
+class DebugStringTableSubsectionRef;
 
 class SymbolVisitorDelegate {
 public:
@@ -27,7 +27,7 @@ class SymbolVisitorDelegate {
 
   virtual uint32_t getRecordOffset(BinaryStreamReader Reader) = 0;
   virtual StringRef getFileNameForFileOffset(uint32_t FileOffset) = 0;
-  virtual StringTableRef getStringTable() = 0;
+  virtual DebugStringTableSubsectionRef getStringTable() = 0;
 };
 
 } // end namespace codeview
diff --git a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
index 8cc5db981f56..e5858d0f45e3 100644
--- a/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h
@@ -11,9 +11,9 @@
 #define LLVM_DEBUGINFO_PDB_RAW_DBIMODULEDESCRIPTORBUILDER_H
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Error.h"
@@ -25,7 +25,7 @@ namespace llvm {
 class BinaryStreamWriter;
 
 namespace codeview {
-class ModuleDebugFragmentRecordBuilder;
+class DebugSubsectionRecordBuilder;
 }
 
 namespace msf {
@@ -49,11 +49,11 @@ class DbiModuleDescriptorBuilder {
   void setObjFileName(StringRef Name);
   void addSymbol(codeview::CVSymbol Symbol);
 
-  void addC13Fragment(std::unique_ptr<codeview::ModuleDebugLineFragment> Lines);
+  void addC13Fragment(std::unique_ptr<codeview::DebugLinesSubsection> Lines);
   void addC13Fragment(
-      std::unique_ptr<codeview::ModuleDebugInlineeLineFragment> Inlinees);
+      std::unique_ptr<codeview::DebugInlineeLinesSubsection> Inlinees);
   void setC13FileChecksums(
-      std::unique_ptr<codeview::ModuleDebugFileChecksumFragment> Checksums);
+      std::unique_ptr<codeview::DebugChecksumsSubsection> Checksums);
 
   uint16_t getStreamIndex() const;
   StringRef getModuleName() const { return ModuleName; }
@@ -83,12 +83,11 @@ class DbiModuleDescriptorBuilder {
   std::vector<std::string> SourceFiles;
   std::vector<codeview::CVSymbol> Symbols;
 
-  std::unique_ptr<codeview::ModuleDebugFileChecksumFragment> ChecksumInfo;
-  std::vector<std::unique_ptr<codeview::ModuleDebugLineFragment>> LineInfo;
-  std::vector<std::unique_ptr<codeview::ModuleDebugInlineeLineFragment>>
-      Inlinees;
+  std::unique_ptr<codeview::DebugChecksumsSubsection> ChecksumInfo;
+  std::vector<std::unique_ptr<codeview::DebugLinesSubsection>> LineInfo;
+  std::vector<std::unique_ptr<codeview::DebugInlineeLinesSubsection>> Inlinees;
 
-  std::vector<std::unique_ptr<codeview::ModuleDebugFragmentRecordBuilder>>
+  std::vector<std::unique_ptr<codeview::DebugSubsectionRecordBuilder>>
       C13Builders;
 
   ModuleInfoHeader Layout;
diff --git a/include/llvm/DebugInfo/PDB/Native/DbiStream.h b/include/llvm/DebugInfo/PDB/Native/DbiStream.h
index 8f95481f4152..dc35f8c72cd9 100644
--- a/include/llvm/DebugInfo/PDB/Native/DbiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/DbiStream.h
@@ -10,7 +10,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_RAW_PDBDBISTREAM_H
 #define LLVM_DEBUGINFO_PDB_RAW_PDBDBISTREAM_H
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptor.h"
 #include "llvm/DebugInfo/PDB/Native/DbiModuleList.h"
@@ -19,8 +19,6 @@
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include "llvm/Support/BinaryStreamArray.h"
-#include "llvm/Support/BinaryStreamArray.h"
-#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
diff --git a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
index 2c95690ed580..822ce3ce13d3 100644
--- a/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/ModuleDebugStream.h
@@ -12,7 +12,7 @@
 
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/Support/BinaryStreamArray.h"
@@ -25,8 +25,7 @@ class PDBFile;
 class DbiModuleDescriptor;
 
 class ModuleDebugStreamRef {
-  typedef codeview::ModuleDebugFragmentArray::Iterator
-      LinesAndChecksumsIterator;
+  typedef codeview::DebugSubsectionArray::Iterator LinesAndChecksumsIterator;
 
 public:
   ModuleDebugStreamRef(const DbiModuleDescriptor &Module,
@@ -58,7 +57,7 @@ class ModuleDebugStreamRef {
   BinaryStreamRef C13LinesSubstream;
   BinaryStreamRef GlobalRefsSubstream;
 
-  codeview::ModuleDebugFragmentArray LinesAndChecksums;
+  codeview::DebugSubsectionArray LinesAndChecksums;
 };
 }
 }
diff --git a/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h b/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
index 7c7f16bd1c73..6aeb0a5479cb 100644
--- a/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
+++ b/include/llvm/DebugInfo/PDB/Native/PDBStringTable.h
@@ -12,7 +12,7 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 #include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
@@ -52,7 +52,7 @@ class PDBStringTable {
   Error readEpilogue(BinaryStreamReader &Reader);
 
   const PDBStringTableHeader *Header = nullptr;
-  codeview::StringTableRef Strings;
+  codeview::DebugStringTableSubsectionRef Strings;
   FixedStreamArray<support::ulittle32_t> IDs;
   uint32_t ByteSize = 0;
   uint32_t NameCount = 0;
diff --git a/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h b/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h
index 6f85e7a4a074..0faa02dc4525 100644
--- a/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/PDBStringTableBuilder.h
@@ -16,7 +16,7 @@
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 #include "llvm/Support/Error.h"
 #include <vector>
 
@@ -41,8 +41,10 @@ class PDBStringTableBuilder {
   uint32_t calculateSerializedSize() const;
   Error commit(BinaryStreamWriter &Writer) const;
 
-  codeview::StringTable &getStrings() { return Strings; }
-  const codeview::StringTable &getStrings() const { return Strings; }
+  codeview::DebugStringTableSubsection &getStrings() { return Strings; }
+  const codeview::DebugStringTableSubsection &getStrings() const {
+    return Strings;
+  }
 
 private:
   uint32_t calculateHashTableSize() const;
@@ -51,7 +53,7 @@ class PDBStringTableBuilder {
   Error writeHashTable(BinaryStreamWriter &Writer) const;
   Error writeEpilogue(BinaryStreamWriter &Writer) const;
 
-  codeview::StringTable Strings;
+  codeview::DebugStringTableSubsection Strings;
 };
 
 } // end namespace pdb
diff --git a/include/llvm/MC/ConstantPools.h b/include/llvm/MC/ConstantPools.h
index 5d4e32a672dd..ef33250204ec 100644
--- a/include/llvm/MC/ConstantPools.h
+++ b/include/llvm/MC/ConstantPools.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/SMLoc.h"
 #include <cstdint>
+#include <map>
 
 namespace llvm {
 
@@ -44,7 +45,7 @@ struct ConstantPoolEntry {
 class ConstantPool {
   using EntryVecTy = SmallVector<ConstantPoolEntry, 4>;
   EntryVecTy Entries;
-  DenseMap<int64_t, const MCSymbolRefExpr *> CachedEntries;
+  std::map<int64_t, const MCSymbolRefExpr *> CachedEntries;
 
 public:
   // Initialize a new empty constant pool
diff --git a/include/llvm/Support/ManagedStatic.h b/include/llvm/Support/ManagedStatic.h
index 7ce86eee95d2..b4bf3210cc73 100644
--- a/include/llvm/Support/ManagedStatic.h
+++ b/include/llvm/Support/ManagedStatic.h
@@ -14,25 +14,22 @@
 #ifndef LLVM_SUPPORT_MANAGEDSTATIC_H
 #define LLVM_SUPPORT_MANAGEDSTATIC_H
 
-#include "llvm/Support/Compiler.h"
 #include <atomic>
 #include <cstddef>
 
 namespace llvm {
 
 /// object_creator - Helper method for ManagedStatic.
-template<class C>
-LLVM_LIBRARY_VISIBILITY void* object_creator() {
-  return new C();
-}
+template <class C> struct object_creator {
+  static void *call() { return new C(); }
+};
 
 /// object_deleter - Helper method for ManagedStatic.
 ///
-template <typename T> struct LLVM_LIBRARY_VISIBILITY object_deleter {
+template <typename T> struct object_deleter {
   static void call(void *Ptr) { delete (T *)Ptr; }
 };
-template <typename T, size_t N>
-struct LLVM_LIBRARY_VISIBILITY object_deleter<T[N]> {
+template <typename T, size_t N> struct object_deleter<T[N]> {
   static void call(void *Ptr) { delete[](T *)Ptr; }
 };
 
@@ -59,14 +56,15 @@ class ManagedStaticBase {
 /// libraries that link in LLVM components) and for making destruction be
 /// explicit through the llvm_shutdown() function call.
 ///
-template<class C>
+template <class C, class Creator = object_creator<C>,
+          class Deleter = object_deleter<C>>
 class ManagedStatic : public ManagedStaticBase {
 public:
   // Accessors.
   C &operator*() {
     void *Tmp = Ptr.load(std::memory_order_acquire);
     if (!Tmp)
-      RegisterManagedStatic(object_creator<C>, object_deleter<C>::call);
+      RegisterManagedStatic(Creator::call, Deleter::call);
 
     return *static_cast<C *>(Ptr.load(std::memory_order_relaxed));
   }
@@ -76,7 +74,7 @@ class ManagedStatic : public ManagedStaticBase {
   const C &operator*() const {
     void *Tmp = Ptr.load(std::memory_order_acquire);
     if (!Tmp)
-      RegisterManagedStatic(object_creator<C>, object_deleter<C>::call);
+      RegisterManagedStatic(Creator::call, Deleter::call);
 
     return *static_cast<C *>(Ptr.load(std::memory_order_relaxed));
   }
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index d14a56cb87e0..437f68b24e57 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -1189,6 +1189,9 @@ class DagInit final : public TypedInit, public FoldingSetNode,
     return Init ? Init->getValue() : StringRef();
   }
 
+  ArrayRef<Init *> getArgs() const {
+    return makeArrayRef(getTrailingObjects<Init *>(), NumArgs);
+  }
   ArrayRef<StringInit *> getArgNames() const {
     return makeArrayRef(getTrailingObjects<StringInit *>(), NumArgNames);
   }
@@ -1200,19 +1203,16 @@ class DagInit final : public TypedInit, public FoldingSetNode,
   typedef SmallVectorImpl<Init*>::const_iterator       const_arg_iterator;
   typedef SmallVectorImpl<StringInit*>::const_iterator const_name_iterator;
 
-  inline const_arg_iterator  arg_begin() const { return getTrailingObjects<Init *>(); }
-  inline const_arg_iterator  arg_end  () const { return arg_begin() + NumArgs;   }
-  inline iterator_range<const_arg_iterator> args() const {
-    return llvm::make_range(arg_begin(), arg_end());
-  }
+  inline const_arg_iterator  arg_begin() const { return getArgs().begin(); }
+  inline const_arg_iterator  arg_end  () const { return getArgs().end(); }
 
-  inline size_t              arg_size () const { return NumArgs;  }
+  inline size_t              arg_size () const { return NumArgs; }
   inline bool                arg_empty() const { return NumArgs == 0; }
 
-  inline const_name_iterator name_begin() const { return getTrailingObjects<StringInit *>(); }
-  inline const_name_iterator name_end  () const { return name_begin() + NumArgNames;   }
+  inline const_name_iterator name_begin() const { return getArgNames().begin();}
+  inline const_name_iterator name_end  () const { return getArgNames().end(); }
 
-  inline size_t              name_size () const { return NumArgNames;  }
+  inline size_t              name_size () const { return NumArgNames; }
   inline bool                name_empty() const { return NumArgNames == 0; }
 
   Init *getBit(unsigned Bit) const override {
diff --git a/include/llvm/Transforms/Scalar/GVNExpression.h b/include/llvm/Transforms/Scalar/GVNExpression.h
index a971df975b6f..324ebca46de2 100644
--- a/include/llvm/Transforms/Scalar/GVNExpression.h
+++ b/include/llvm/Transforms/Scalar/GVNExpression.h
@@ -58,10 +58,11 @@ class Expression {
 private:
   ExpressionType EType;
   unsigned Opcode;
+  mutable hash_code HashVal;
 
 public:
   Expression(ExpressionType ET = ET_Base, unsigned O = ~2U)
-      : EType(ET), Opcode(O) {}
+      : EType(ET), Opcode(O), HashVal(0) {}
   Expression(const Expression &) = delete;
   Expression &operator=(const Expression &) = delete;
   virtual ~Expression();
@@ -82,6 +83,14 @@ class Expression {
 
     return equals(Other);
   }
+  hash_code getComputedHash() const {
+    // It's theoretically possible for a thing to hash to zero.  In that case,
+    // we will just compute the hash a few extra times, which is no worse that
+    // we did before, which was to compute it always.
+    if (static_cast<unsigned>(HashVal) == 0)
+      HashVal = getHashValue();
+    return HashVal;
+  }
 
   virtual bool equals(const Expression &Other) const { return true; }
 
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index d280fda0a162..f55ce202bcbb 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -2178,8 +2178,7 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
   return Flags;
 }
 
-bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L,
-                                             DominatorTree &DT, LoopInfo &LI) {
+bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L) {
   if (!isLoopInvariant(S, L))
     return false;
   // If a value depends on a SCEVUnknown which is defined after the loop, we
@@ -2516,7 +2515,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
     const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
     const Loop *AddRecLoop = AddRec->getLoop();
     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      if (isAvailableAtLoopEntry(Ops[i], AddRecLoop, DT, LI)) {
+      if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) {
         LIOps.push_back(Ops[i]);
         Ops.erase(Ops.begin()+i);
         --i; --e;
@@ -2791,7 +2790,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
     const SCEVAddRecExpr *AddRec = cast<SCEVAddRecExpr>(Ops[Idx]);
     const Loop *AddRecLoop = AddRec->getLoop();
     for (unsigned i = 0, e = Ops.size(); i != e; ++i)
-      if (isAvailableAtLoopEntry(Ops[i], AddRecLoop, DT, LI)) {
+      if (isAvailableAtLoopEntry(Ops[i], AddRecLoop)) {
         LIOps.push_back(Ops[i]);
         Ops.erase(Ops.begin()+i);
         --i; --e;
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 114aea391a86..385c78bbccef 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -15,8 +15,8 @@
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeDatabase.h"
 #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
@@ -393,7 +393,7 @@ void CodeViewDebug::endModule() {
   // subprograms.
   switchToDebugSectionForSymbol(nullptr);
 
-  MCSymbol *CompilerInfo = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
+  MCSymbol *CompilerInfo = beginCVSubsection(DebugSubsectionKind::Symbols);
   emitCompilerInformation();
   endCVSubsection(CompilerInfo);
 
@@ -417,7 +417,7 @@ void CodeViewDebug::endModule() {
 
   // Emit UDT records for any types used by global variables.
   if (!GlobalUDTs.empty()) {
-    MCSymbol *SymbolsEnd = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
+    MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
     emitDebugInfoForUDTs(GlobalUDTs);
     endCVSubsection(SymbolsEnd);
   }
@@ -630,8 +630,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
     return;
 
   OS.AddComment("Inlinee lines subsection");
-  MCSymbol *InlineEnd =
-      beginCVSubsection(ModuleDebugFragmentKind::InlineeLines);
+  MCSymbol *InlineEnd = beginCVSubsection(DebugSubsectionKind::InlineeLines);
 
   // We don't provide any extra file info.
   // FIXME: Find out if debuggers use this info.
@@ -756,7 +755,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
 
   // Emit a symbol subsection, required by VS2012+ to find function boundaries.
   OS.AddComment("Symbol subsection for " + Twine(FuncName));
-  MCSymbol *SymbolsEnd = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
+  MCSymbol *SymbolsEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
   {
     MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(),
              *ProcRecordEnd = MMI->getContext().createTempSymbol();
@@ -2111,7 +2110,7 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
   maybeRecordLocation(DL, Asm->MF);
 }
 
-MCSymbol *CodeViewDebug::beginCVSubsection(ModuleDebugFragmentKind Kind) {
+MCSymbol *CodeViewDebug::beginCVSubsection(DebugSubsectionKind Kind) {
   MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
            *EndLabel = MMI->getContext().createTempSymbol();
   OS.EmitIntValue(unsigned(Kind), 4);
@@ -2171,7 +2170,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
         if (!GV->hasComdat() && !GV->isDeclarationForLinker()) {
           if (!EndLabel) {
             OS.AddComment("Symbol subsection for globals");
-            EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
+            EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols);
           }
           // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
           emitDebugInfoForGlobal(GVE->getVariable(), GV, Asm->getSymbol(GV));
@@ -2189,7 +2188,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() {
           OS.AddComment("Symbol subsection for " +
                         Twine(GlobalValue::dropLLVMManglingEscape(GV->getName())));
           switchToDebugSectionForSymbol(GVSym);
-          EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols);
+          EndLabel = beginCVSubsection(DebugSubsectionKind::Symbols);
           // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions.
           emitDebugInfoForGlobal(GVE->getVariable(), GV, GVSym);
           endCVSubsection(EndLabel);
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index 46b2daa1e007..1c0c1644edaf 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -216,7 +216,7 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
   /// Opens a subsection of the given kind in a .debug$S codeview section.
   /// Returns an end label for use with endCVSubsection when the subsection is
   /// finished.
-  MCSymbol *beginCVSubsection(codeview::ModuleDebugFragmentKind Kind);
+  MCSymbol *beginCVSubsection(codeview::DebugSubsectionKind Kind);
 
   void endCVSubsection(MCSymbol *EndLabel);
 
diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp
index bdca732b4e33..c2a568e4b452 100644
--- a/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -23,7 +23,7 @@ using namespace llvm;
 char Localizer::ID = 0;
 INITIALIZE_PASS(Localizer, DEBUG_TYPE,
                 "Move/duplicate certain instructions close to their use", false,
-                false);
+                false)
 
 Localizer::Localizer() : MachineFunctionPass(ID) {
   initializeLocalizerPass(*PassRegistry::getPassRegistry());
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 23a302f3e561..ab36bc1417ae 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14567,7 +14567,8 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
   // extract instead or remove that condition entirely.
   auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
   auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
-  if (!Ld || !Ld->hasOneUse() || Ld->isVolatile() || !ExtIdx)
+  if (!Ld || !Ld->hasOneUse() || Ld->getExtensionType() || Ld->isVolatile() ||
+      !ExtIdx)
     return SDValue();
 
   // The narrow load will be offset from the base address of the old load if
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index d0a8b34c69c6..da2fb72bec45 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -925,10 +925,6 @@ getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) {
   if (Action != TargetLowering::Legal)
     Action = TargetLowering::Expand;
 
-  // ISD::FPOWI returns 'Legal' even though it should be expanded.
-  if (Opcode == ISD::STRICT_FPOWI && Action == TargetLowering::Legal)
-    Action = TargetLowering::Expand;
-
   return Action;
 }
 
@@ -1027,7 +1023,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
     break;
   case ISD::EXTRACT_ELEMENT:
   case ISD::FLT_ROUNDS_:
-  case ISD::FPOWI:
   case ISD::MERGE_VALUES:
   case ISD::EH_RETURN:
   case ISD::FRAME_TO_ARGS_OFFSET:
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 5f63fd4320bb..0def5ae6d0d0 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -935,6 +935,7 @@ void TargetLoweringBase::initActions() {
     
     // These library functions default to expand.
     setOperationAction(ISD::FROUND, VT, Expand);
+    setOperationAction(ISD::FPOWI, VT, Expand);
 
     // These operations default to expand for vector types.
     if (VT.isVector()) {
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index 90193d07b95d..410b89bc949e 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -7,14 +7,16 @@ add_llvm_library(LLVMDebugInfoCodeView
   Formatters.cpp
   LazyRandomTypeCollection.cpp
   Line.cpp
-  ModuleDebugFileChecksumFragment.cpp
-  ModuleDebugFragment.cpp
-  ModuleDebugFragmentRecord.cpp
-  ModuleDebugFragmentVisitor.cpp
-  ModuleDebugInlineeLinesFragment.cpp
-  ModuleDebugLineFragment.cpp
+  DebugChecksumsSubsection.cpp
+  DebugFrameDataSubsection.cpp
+  DebugInlineeLinesSubsection.cpp
+  DebugLinesSubsection.cpp
+  DebugStringTableSubsection.cpp
+  DebugSubsection.cpp
+  DebugSubsectionRecord.cpp
+  DebugSubsectionVisitor.cpp
+  DebugSymbolsSubsection.cpp
   RecordSerialization.cpp
-  StringTable.cpp
   SymbolRecordMapping.cpp
   SymbolDumper.cpp
   SymbolSerializer.cpp
diff --git a/lib/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.cpp b/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
similarity index 72%
rename from lib/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.cpp
rename to lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
index 42f0afc3e2d7..1a85a339f8c3 100644
--- a/lib/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.cpp
+++ b/lib/DebugInfo/CodeView/DebugChecksumsSubsection.cpp
@@ -1,4 +1,4 @@
-//===- ModuleDebugFileChecksumFragment.cpp ----------------------*- C++ -*-===//
+//===- DebugChecksumsSubsection.cpp ----------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
 
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 #include "llvm/Support/BinaryStreamReader.h"
 
 using namespace llvm;
@@ -42,22 +42,24 @@ Error llvm::VarStreamArrayExtractor<FileChecksumEntry>::extract(
   return Error::success();
 }
 
-Error ModuleDebugFileChecksumFragmentRef::initialize(
-    BinaryStreamReader Reader) {
+Error DebugChecksumsSubsectionRef::initialize(BinaryStreamReader Reader) {
   if (auto EC = Reader.readArray(Checksums, Reader.bytesRemaining()))
     return EC;
 
   return Error::success();
 }
+Error DebugChecksumsSubsectionRef::initialize(BinaryStreamRef Section) {
+  BinaryStreamReader Reader(Section);
+  return initialize(Reader);
+}
 
-ModuleDebugFileChecksumFragment::ModuleDebugFileChecksumFragment(
-    StringTable &Strings)
-    : ModuleDebugFragment(ModuleDebugFragmentKind::FileChecksums),
-      Strings(Strings) {}
+DebugChecksumsSubsection::DebugChecksumsSubsection(
+    DebugStringTableSubsection &Strings)
+    : DebugSubsection(DebugSubsectionKind::FileChecksums), Strings(Strings) {}
 
-void ModuleDebugFileChecksumFragment::addChecksum(StringRef FileName,
-                                                  FileChecksumKind Kind,
-                                                  ArrayRef<uint8_t> Bytes) {
+void DebugChecksumsSubsection::addChecksum(StringRef FileName,
+                                           FileChecksumKind Kind,
+                                           ArrayRef<uint8_t> Bytes) {
   FileChecksumEntry Entry;
   if (!Bytes.empty()) {
     uint8_t *Copy = Storage.Allocate<uint8_t>(Bytes.size());
@@ -78,11 +80,11 @@ void ModuleDebugFileChecksumFragment::addChecksum(StringRef FileName,
   SerializedSize += Len;
 }
 
-uint32_t ModuleDebugFileChecksumFragment::calculateSerializedLength() {
+uint32_t DebugChecksumsSubsection::calculateSerializedSize() const {
   return SerializedSize;
 }
 
-Error ModuleDebugFileChecksumFragment::commit(BinaryStreamWriter &Writer) {
+Error DebugChecksumsSubsection::commit(BinaryStreamWriter &Writer) const {
   for (const auto &FC : Checksums) {
     FileChecksumEntryHeader Header;
     Header.ChecksumKind = uint8_t(FC.Kind);
@@ -98,8 +100,7 @@ Error ModuleDebugFileChecksumFragment::commit(BinaryStreamWriter &Writer) {
   return Error::success();
 }
 
-uint32_t
-ModuleDebugFileChecksumFragment::mapChecksumOffset(StringRef FileName) const {
+uint32_t DebugChecksumsSubsection::mapChecksumOffset(StringRef FileName) const {
   uint32_t Offset = Strings.getStringId(FileName);
   auto Iter = OffsetMap.find(Offset);
   assert(Iter != OffsetMap.end());
diff --git a/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp b/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp
new file mode 100644
index 000000000000..fd558aa9cc8a
--- /dev/null
+++ b/lib/DebugInfo/CodeView/DebugFrameDataSubsection.cpp
@@ -0,0 +1,44 @@
+//===- DebugFrameDataSubsection.cpp -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error DebugFrameDataSubsectionRef::initialize(BinaryStreamReader Reader) {
+  if (auto EC = Reader.readObject(RelocPtr))
+    return EC;
+  if (Reader.bytesRemaining() % sizeof(FrameData) != 0)
+    return make_error<CodeViewError>(cv_error_code::corrupt_record,
+                                     "Invalid frame data record format!");
+
+  uint32_t Count = Reader.bytesRemaining() / sizeof(FrameData);
+  if (auto EC = Reader.readArray(Frames, Count))
+    return EC;
+  return Error::success();
+}
+
+uint32_t DebugFrameDataSubsection::calculateSerializedSize() const {
+  return 4 + sizeof(FrameData) * Frames.size();
+}
+
+Error DebugFrameDataSubsection::commit(BinaryStreamWriter &Writer) const {
+  if (auto EC = Writer.writeInteger<uint32_t>(0))
+    return EC;
+
+  if (auto EC = Writer.writeArray(makeArrayRef(Frames)))
+    return EC;
+  return Error::success();
+}
+
+void DebugFrameDataSubsection::addFrameData(const FrameData &Frame) {
+  Frames.push_back(Frame);
+}
diff --git a/lib/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.cpp b/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
similarity index 66%
rename from lib/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.cpp
rename to lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
index cb6a8478797f..520a0ee4454f 100644
--- a/lib/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.cpp
+++ b/lib/DebugInfo/CodeView/DebugInlineeLinesSubsection.cpp
@@ -1,4 +1,4 @@
-//===- ModuleDebugInlineeLineFragment.cpp ------------------------*- C++-*-===//
+//===- DebugInlineeLinesSubsection.cpp ------------------------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
 
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -37,10 +37,10 @@ Error VarStreamArrayExtractor<InlineeSourceLine>::extract(
   return Error::success();
 }
 
-ModuleDebugInlineeLineFragmentRef::ModuleDebugInlineeLineFragmentRef()
-    : ModuleDebugFragmentRef(ModuleDebugFragmentKind::InlineeLines) {}
+DebugInlineeLinesSubsectionRef::DebugInlineeLinesSubsectionRef()
+    : DebugSubsectionRef(DebugSubsectionKind::InlineeLines) {}
 
-Error ModuleDebugInlineeLineFragmentRef::initialize(BinaryStreamReader Reader) {
+Error DebugInlineeLinesSubsectionRef::initialize(BinaryStreamReader Reader) {
   if (auto EC = Reader.readEnum(Signature))
     return EC;
 
@@ -52,16 +52,16 @@ Error ModuleDebugInlineeLineFragmentRef::initialize(BinaryStreamReader Reader) {
   return Error::success();
 }
 
-bool ModuleDebugInlineeLineFragmentRef::hasExtraFiles() const {
+bool DebugInlineeLinesSubsectionRef::hasExtraFiles() const {
   return Signature == InlineeLinesSignature::ExtraFiles;
 }
 
-ModuleDebugInlineeLineFragment::ModuleDebugInlineeLineFragment(
-    ModuleDebugFileChecksumFragment &Checksums, bool HasExtraFiles)
-    : ModuleDebugFragment(ModuleDebugFragmentKind::InlineeLines),
-      Checksums(Checksums), HasExtraFiles(HasExtraFiles) {}
+DebugInlineeLinesSubsection::DebugInlineeLinesSubsection(
+    DebugChecksumsSubsection &Checksums, bool HasExtraFiles)
+    : DebugSubsection(DebugSubsectionKind::InlineeLines), Checksums(Checksums),
+      HasExtraFiles(HasExtraFiles) {}
 
-uint32_t ModuleDebugInlineeLineFragment::calculateSerializedLength() {
+uint32_t DebugInlineeLinesSubsection::calculateSerializedSize() const {
   // 4 bytes for the signature
   uint32_t Size = sizeof(InlineeLinesSignature);
 
@@ -78,7 +78,7 @@ uint32_t ModuleDebugInlineeLineFragment::calculateSerializedLength() {
   return Size;
 }
 
-Error ModuleDebugInlineeLineFragment::commit(BinaryStreamWriter &Writer) {
+Error DebugInlineeLinesSubsection::commit(BinaryStreamWriter &Writer) const {
   InlineeLinesSignature Sig = InlineeLinesSignature::Normal;
   if (HasExtraFiles)
     Sig = InlineeLinesSignature::ExtraFiles;
@@ -102,7 +102,7 @@ Error ModuleDebugInlineeLineFragment::commit(BinaryStreamWriter &Writer) {
   return Error::success();
 }
 
-void ModuleDebugInlineeLineFragment::addExtraFile(StringRef FileName) {
+void DebugInlineeLinesSubsection::addExtraFile(StringRef FileName) {
   uint32_t Offset = Checksums.mapChecksumOffset(FileName);
 
   auto &Entry = Entries.back();
@@ -110,9 +110,9 @@ void ModuleDebugInlineeLineFragment::addExtraFile(StringRef FileName) {
   ++ExtraFileCount;
 }
 
-void ModuleDebugInlineeLineFragment::addInlineSite(TypeIndex FuncId,
-                                                   StringRef FileName,
-                                                   uint32_t SourceLine) {
+void DebugInlineeLinesSubsection::addInlineSite(TypeIndex FuncId,
+                                                StringRef FileName,
+                                                uint32_t SourceLine) {
   uint32_t Offset = Checksums.mapChecksumOffset(FileName);
 
   Entries.emplace_back();
diff --git a/lib/DebugInfo/CodeView/ModuleDebugLineFragment.cpp b/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp
similarity index 69%
rename from lib/DebugInfo/CodeView/ModuleDebugLineFragment.cpp
rename to lib/DebugInfo/CodeView/DebugLinesSubsection.cpp
index e0ee934709ba..2fce06ca2a17 100644
--- a/lib/DebugInfo/CodeView/ModuleDebugLineFragment.cpp
+++ b/lib/DebugInfo/CodeView/DebugLinesSubsection.cpp
@@ -1,4 +1,4 @@
-//===- ModuleDebugLineFragment.cpp -------------------------------*- C++-*-===//
+//===- DebugLinesSubsection.cpp -------------------------------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
 
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -49,10 +49,10 @@ Error LineColumnExtractor::extract(BinaryStreamRef Stream, uint32_t &Len,
   return Error::success();
 }
 
-ModuleDebugLineFragmentRef::ModuleDebugLineFragmentRef()
-    : ModuleDebugFragmentRef(ModuleDebugFragmentKind::Lines) {}
+DebugLinesSubsectionRef::DebugLinesSubsectionRef()
+    : DebugSubsectionRef(DebugSubsectionKind::Lines) {}
 
-Error ModuleDebugLineFragmentRef::initialize(BinaryStreamReader Reader) {
+Error DebugLinesSubsectionRef::initialize(BinaryStreamReader Reader) {
   if (auto EC = Reader.readObject(Header))
     return EC;
 
@@ -63,23 +63,21 @@ Error ModuleDebugLineFragmentRef::initialize(BinaryStreamReader Reader) {
   return Error::success();
 }
 
-bool ModuleDebugLineFragmentRef::hasColumnInfo() const {
+bool DebugLinesSubsectionRef::hasColumnInfo() const {
   return !!(Header->Flags & LF_HaveColumns);
 }
 
-ModuleDebugLineFragment::ModuleDebugLineFragment(
-    ModuleDebugFileChecksumFragment &Checksums, StringTable &Strings)
-    : ModuleDebugFragment(ModuleDebugFragmentKind::Lines),
-      Checksums(Checksums) {}
+DebugLinesSubsection::DebugLinesSubsection(DebugChecksumsSubsection &Checksums,
+                                           DebugStringTableSubsection &Strings)
+    : DebugSubsection(DebugSubsectionKind::Lines), Checksums(Checksums) {}
 
-void ModuleDebugLineFragment::createBlock(StringRef FileName) {
+void DebugLinesSubsection::createBlock(StringRef FileName) {
   uint32_t Offset = Checksums.mapChecksumOffset(FileName);
 
   Blocks.emplace_back(Offset);
 }
 
-void ModuleDebugLineFragment::addLineInfo(uint32_t Offset,
-                                          const LineInfo &Line) {
+void DebugLinesSubsection::addLineInfo(uint32_t Offset, const LineInfo &Line) {
   Block &B = Blocks.back();
   LineNumberEntry LNE;
   LNE.Flags = Line.getRawData();
@@ -87,10 +85,10 @@ void ModuleDebugLineFragment::addLineInfo(uint32_t Offset,
   B.Lines.push_back(LNE);
 }
 
-void ModuleDebugLineFragment::addLineAndColumnInfo(uint32_t Offset,
-                                                   const LineInfo &Line,
-                                                   uint32_t ColStart,
-                                                   uint32_t ColEnd) {
+void DebugLinesSubsection::addLineAndColumnInfo(uint32_t Offset,
+                                                const LineInfo &Line,
+                                                uint32_t ColStart,
+                                                uint32_t ColEnd) {
   Block &B = Blocks.back();
   assert(B.Lines.size() == B.Columns.size());
 
@@ -101,7 +99,7 @@ void ModuleDebugLineFragment::addLineAndColumnInfo(uint32_t Offset,
   B.Columns.push_back(CNE);
 }
 
-Error ModuleDebugLineFragment::commit(BinaryStreamWriter &Writer) {
+Error DebugLinesSubsection::commit(BinaryStreamWriter &Writer) const {
   LineFragmentHeader Header;
   Header.CodeSize = CodeSize;
   Header.Flags = hasColumnInfo() ? LF_HaveColumns : 0;
@@ -135,7 +133,7 @@ Error ModuleDebugLineFragment::commit(BinaryStreamWriter &Writer) {
   return Error::success();
 }
 
-uint32_t ModuleDebugLineFragment::calculateSerializedLength() {
+uint32_t DebugLinesSubsection::calculateSerializedSize() const {
   uint32_t Size = sizeof(LineFragmentHeader);
   for (const auto &B : Blocks) {
     Size += sizeof(LineBlockFragmentHeader);
@@ -146,16 +144,16 @@ uint32_t ModuleDebugLineFragment::calculateSerializedLength() {
   return Size;
 }
 
-void ModuleDebugLineFragment::setRelocationAddress(uint16_t Segment,
-                                                   uint16_t Offset) {
+void DebugLinesSubsection::setRelocationAddress(uint16_t Segment,
+                                                uint16_t Offset) {
   RelocOffset = Offset;
   RelocSegment = Segment;
 }
 
-void ModuleDebugLineFragment::setCodeSize(uint32_t Size) { CodeSize = Size; }
+void DebugLinesSubsection::setCodeSize(uint32_t Size) { CodeSize = Size; }
 
-void ModuleDebugLineFragment::setFlags(LineFlags Flags) { this->Flags = Flags; }
+void DebugLinesSubsection::setFlags(LineFlags Flags) { this->Flags = Flags; }
 
-bool ModuleDebugLineFragment::hasColumnInfo() const {
+bool DebugLinesSubsection::hasColumnInfo() const {
   return Flags & LF_HaveColumns;
 }
diff --git a/lib/DebugInfo/CodeView/StringTable.cpp b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
similarity index 61%
rename from lib/DebugInfo/CodeView/StringTable.cpp
rename to lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
index 21f11204686b..b8741eb0b675 100644
--- a/lib/DebugInfo/CodeView/StringTable.cpp
+++ b/lib/DebugInfo/CodeView/DebugStringTableSubsection.cpp
@@ -1,4 +1,4 @@
-//===- StringTable.cpp - CodeView String Table Reader/Writer ----*- C++ -*-===//
+//===- DebugStringTableSubsection.cpp - CodeView String Table ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/CodeView/StringTable.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 
 #include "llvm/Support/BinaryStream.h"
 #include "llvm/Support/BinaryStreamReader.h"
@@ -16,14 +16,16 @@
 using namespace llvm;
 using namespace llvm::codeview;
 
-StringTableRef::StringTableRef() {}
+DebugStringTableSubsectionRef::DebugStringTableSubsectionRef()
+    : DebugSubsectionRef(DebugSubsectionKind::StringTable) {}
 
-Error StringTableRef::initialize(BinaryStreamRef Contents) {
+Error DebugStringTableSubsectionRef::initialize(BinaryStreamRef Contents) {
   Stream = Contents;
   return Error::success();
 }
 
-Expected<StringRef> StringTableRef::getString(uint32_t Offset) const {
+Expected<StringRef>
+DebugStringTableSubsectionRef::getString(uint32_t Offset) const {
   BinaryStreamReader Reader(Stream);
   Reader.setOffset(Offset);
   StringRef Result;
@@ -32,7 +34,10 @@ Expected<StringRef> StringTableRef::getString(uint32_t Offset) const {
   return Result;
 }
 
-uint32_t StringTable::insert(StringRef S) {
+DebugStringTableSubsection::DebugStringTableSubsection()
+    : DebugSubsection(DebugSubsectionKind::StringTable) {}
+
+uint32_t DebugStringTableSubsection::insert(StringRef S) {
   auto P = Strings.insert({S, StringSize});
 
   // If a given string didn't exist in the string table, we want to increment
@@ -42,9 +47,11 @@ uint32_t StringTable::insert(StringRef S) {
   return P.first->second;
 }
 
-uint32_t StringTable::calculateSerializedSize() const { return StringSize; }
+uint32_t DebugStringTableSubsection::calculateSerializedSize() const {
+  return StringSize;
+}
 
-Error StringTable::commit(BinaryStreamWriter &Writer) const {
+Error DebugStringTableSubsection::commit(BinaryStreamWriter &Writer) const {
   assert(Writer.bytesRemaining() == StringSize);
   uint32_t MaxOffset = 1;
 
@@ -62,9 +69,9 @@ Error StringTable::commit(BinaryStreamWriter &Writer) const {
   return Error::success();
 }
 
-uint32_t StringTable::size() const { return Strings.size(); }
+uint32_t DebugStringTableSubsection::size() const { return Strings.size(); }
 
-uint32_t StringTable::getStringId(StringRef S) const {
+uint32_t DebugStringTableSubsection::getStringId(StringRef S) const {
   auto P = Strings.find(S);
   assert(P != Strings.end());
   return P->second;
diff --git a/lib/DebugInfo/CodeView/ModuleDebugFragment.cpp b/lib/DebugInfo/CodeView/DebugSubsection.cpp
similarity index 55%
rename from lib/DebugInfo/CodeView/ModuleDebugFragment.cpp
rename to lib/DebugInfo/CodeView/DebugSubsection.cpp
index 2af1917413da..67b428bfa713 100644
--- a/lib/DebugInfo/CodeView/ModuleDebugFragment.cpp
+++ b/lib/DebugInfo/CodeView/DebugSubsection.cpp
@@ -1,4 +1,4 @@
-//===- ModuleDebugFragment.cpp -----------------------------------*- C++-*-===//
+//===- DebugSubsection.cpp -----------------------------------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
 
 using namespace llvm::codeview;
 
-ModuleDebugFragmentRef::~ModuleDebugFragmentRef() {}
+DebugSubsectionRef::~DebugSubsectionRef() {}
 
-ModuleDebugFragment::~ModuleDebugFragment() {}
+DebugSubsection::~DebugSubsection() {}
diff --git a/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
new file mode 100644
index 000000000000..511f36d0020a
--- /dev/null
+++ b/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
@@ -0,0 +1,81 @@
+//===- DebugSubsectionRecord.cpp -----------------------------*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+
+#include "llvm/Support/BinaryStreamReader.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+DebugSubsectionRecord::DebugSubsectionRecord()
+    : Kind(DebugSubsectionKind::None) {}
+
+DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind,
+                                             BinaryStreamRef Data)
+    : Kind(Kind), Data(Data) {}
+
+Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream,
+                                        DebugSubsectionRecord &Info) {
+  const DebugSubsectionHeader *Header;
+  BinaryStreamReader Reader(Stream);
+  if (auto EC = Reader.readObject(Header))
+    return EC;
+
+  DebugSubsectionKind Kind =
+      static_cast<DebugSubsectionKind>(uint32_t(Header->Kind));
+  switch (Kind) {
+  case DebugSubsectionKind::FileChecksums:
+  case DebugSubsectionKind::Lines:
+  case DebugSubsectionKind::InlineeLines:
+    break;
+  default:
+    llvm_unreachable("Unexpected debug fragment kind!");
+  }
+  if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
+    return EC;
+  Info.Kind = Kind;
+  return Error::success();
+}
+
+uint32_t DebugSubsectionRecord::getRecordLength() const {
+  uint32_t Result = sizeof(DebugSubsectionHeader) + Data.getLength();
+  assert(Result % 4 == 0);
+  return Result;
+}
+
+DebugSubsectionKind DebugSubsectionRecord::kind() const { return Kind; }
+
+BinaryStreamRef DebugSubsectionRecord::getRecordData() const { return Data; }
+
+DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder(
+    DebugSubsectionKind Kind, DebugSubsection &Frag)
+    : Kind(Kind), Frag(Frag) {}
+
+uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() {
+  uint32_t Size = sizeof(DebugSubsectionHeader) +
+                  alignTo(Frag.calculateSerializedSize(), 4);
+  return Size;
+}
+
+Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) {
+  DebugSubsectionHeader Header;
+  Header.Kind = uint32_t(Kind);
+  Header.Length = calculateSerializedLength() - sizeof(DebugSubsectionHeader);
+
+  if (auto EC = Writer.writeObject(Header))
+    return EC;
+  if (auto EC = Frag.commit(Writer))
+    return EC;
+  if (auto EC = Writer.padToAlignment(4))
+    return EC;
+
+  return Error::success();
+}
diff --git a/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp b/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp
new file mode 100644
index 000000000000..f2c4dea8685f
--- /dev/null
+++ b/lib/DebugInfo/CodeView/DebugSubsectionVisitor.cpp
@@ -0,0 +1,52 @@
+//===- DebugSubsectionVisitor.cpp ---------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
+
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/CodeView/DebugUnknownSubsection.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error llvm::codeview::visitDebugSubsection(const DebugSubsectionRecord &R,
+                                           DebugSubsectionVisitor &V) {
+  BinaryStreamReader Reader(R.getRecordData());
+  switch (R.kind()) {
+  case DebugSubsectionKind::Lines: {
+    DebugLinesSubsectionRef Fragment;
+    if (auto EC = Fragment.initialize(Reader))
+      return EC;
+
+    return V.visitLines(Fragment);
+  }
+  case DebugSubsectionKind::FileChecksums: {
+    DebugChecksumsSubsectionRef Fragment;
+    if (auto EC = Fragment.initialize(Reader))
+      return EC;
+
+    return V.visitFileChecksums(Fragment);
+  }
+  case DebugSubsectionKind::InlineeLines: {
+    DebugInlineeLinesSubsectionRef Fragment;
+    if (auto EC = Fragment.initialize(Reader))
+      return EC;
+    return V.visitInlineeLines(Fragment);
+  }
+  default: {
+    DebugUnknownSubsectionRef Fragment(R.kind(), R.getRecordData());
+    return V.visitUnknown(Fragment);
+  }
+  }
+}
diff --git a/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp b/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp
new file mode 100644
index 000000000000..dc8ba8c929ae
--- /dev/null
+++ b/lib/DebugInfo/CodeView/DebugSymbolsSubsection.cpp
@@ -0,0 +1,34 @@
+//===- DebugSymbolsSubsection.cpp -------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/DebugSymbolsSubsection.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+Error DebugSymbolsSubsectionRef::initialize(BinaryStreamReader Reader) {
+  return Reader.readArray(Records, Reader.getLength());
+}
+
+uint32_t DebugSymbolsSubsection::calculateSerializedSize() const {
+  return Length;
+}
+
+Error DebugSymbolsSubsection::commit(BinaryStreamWriter &Writer) const {
+  for (const auto &Record : Records) {
+    if (auto EC = Writer.writeBytes(Record.RecordData))
+      return EC;
+  }
+  return Error::success();
+}
+
+void DebugSymbolsSubsection::addSymbol(CVSymbol Symbol) {
+  Records.push_back(Symbol);
+  Length += Symbol.length();
+}
\ No newline at end of file
diff --git a/lib/DebugInfo/CodeView/EnumTables.cpp b/lib/DebugInfo/CodeView/EnumTables.cpp
index fc6008ba66de..0441110c85ef 100644
--- a/lib/DebugInfo/CodeView/EnumTables.cpp
+++ b/lib/DebugInfo/CodeView/EnumTables.cpp
@@ -245,20 +245,20 @@ static const EnumEntry<uint32_t> FrameProcSymFlagNames[] = {
 };
 
 static const EnumEntry<uint32_t> ModuleSubstreamKindNames[] = {
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, None),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, Symbols),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, Lines),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, StringTable),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, FileChecksums),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, FrameData),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, InlineeLines),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, CrossScopeImports),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, CrossScopeExports),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, ILLines),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, FuncMDTokenMap),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, TypeMDTokenMap),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, MergedAssemblyInput),
-    CV_ENUM_CLASS_ENT(ModuleDebugFragmentKind, CoffSymbolRVA),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, None),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, Symbols),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, Lines),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, StringTable),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, FileChecksums),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, FrameData),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, InlineeLines),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, CrossScopeImports),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, CrossScopeExports),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, ILLines),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, FuncMDTokenMap),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, TypeMDTokenMap),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, MergedAssemblyInput),
+    CV_ENUM_CLASS_ENT(DebugSubsectionKind, CoffSymbolRVA),
 };
 
 static const EnumEntry<uint16_t> ExportSymFlagNames[] = {
diff --git a/lib/DebugInfo/CodeView/ModuleDebugFragmentRecord.cpp b/lib/DebugInfo/CodeView/ModuleDebugFragmentRecord.cpp
deleted file mode 100644
index b2543de78069..000000000000
--- a/lib/DebugInfo/CodeView/ModuleDebugFragmentRecord.cpp
+++ /dev/null
@@ -1,84 +0,0 @@
-//===- ModuleDebugFragmentRecord.cpp -----------------------------*- C++-*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
-
-#include "llvm/Support/BinaryStreamReader.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-ModuleDebugFragmentRecord::ModuleDebugFragmentRecord()
-    : Kind(ModuleDebugFragmentKind::None) {}
-
-ModuleDebugFragmentRecord::ModuleDebugFragmentRecord(
-    ModuleDebugFragmentKind Kind, BinaryStreamRef Data)
-    : Kind(Kind), Data(Data) {}
-
-Error ModuleDebugFragmentRecord::initialize(BinaryStreamRef Stream,
-                                            ModuleDebugFragmentRecord &Info) {
-  const ModuleDebugFragmentHeader *Header;
-  BinaryStreamReader Reader(Stream);
-  if (auto EC = Reader.readObject(Header))
-    return EC;
-
-  ModuleDebugFragmentKind Kind =
-      static_cast<ModuleDebugFragmentKind>(uint32_t(Header->Kind));
-  switch (Kind) {
-  case ModuleDebugFragmentKind::FileChecksums:
-  case ModuleDebugFragmentKind::Lines:
-  case ModuleDebugFragmentKind::InlineeLines:
-    break;
-  default:
-    llvm_unreachable("Unexpected debug fragment kind!");
-  }
-  if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
-    return EC;
-  Info.Kind = Kind;
-  return Error::success();
-}
-
-uint32_t ModuleDebugFragmentRecord::getRecordLength() const {
-  uint32_t Result = sizeof(ModuleDebugFragmentHeader) + Data.getLength();
-  assert(Result % 4 == 0);
-  return Result;
-}
-
-ModuleDebugFragmentKind ModuleDebugFragmentRecord::kind() const { return Kind; }
-
-BinaryStreamRef ModuleDebugFragmentRecord::getRecordData() const {
-  return Data;
-}
-
-ModuleDebugFragmentRecordBuilder::ModuleDebugFragmentRecordBuilder(
-    ModuleDebugFragmentKind Kind, ModuleDebugFragment &Frag)
-    : Kind(Kind), Frag(Frag) {}
-
-uint32_t ModuleDebugFragmentRecordBuilder::calculateSerializedLength() {
-  uint32_t Size = sizeof(ModuleDebugFragmentHeader) +
-                  alignTo(Frag.calculateSerializedLength(), 4);
-  return Size;
-}
-
-Error ModuleDebugFragmentRecordBuilder::commit(BinaryStreamWriter &Writer) {
-  ModuleDebugFragmentHeader Header;
-  Header.Kind = uint32_t(Kind);
-  Header.Length =
-      calculateSerializedLength() - sizeof(ModuleDebugFragmentHeader);
-
-  if (auto EC = Writer.writeObject(Header))
-    return EC;
-  if (auto EC = Frag.commit(Writer))
-    return EC;
-  if (auto EC = Writer.padToAlignment(4))
-    return EC;
-
-  return Error::success();
-}
diff --git a/lib/DebugInfo/CodeView/ModuleDebugFragmentVisitor.cpp b/lib/DebugInfo/CodeView/ModuleDebugFragmentVisitor.cpp
deleted file mode 100644
index dc591f3990e2..000000000000
--- a/lib/DebugInfo/CodeView/ModuleDebugFragmentVisitor.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===- ModuleDebugFragmentVisitor.cpp ---------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h"
-
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h"
-#include "llvm/Support/BinaryStreamReader.h"
-#include "llvm/Support/BinaryStreamRef.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-Error llvm::codeview::visitModuleDebugFragment(
-    const ModuleDebugFragmentRecord &R, ModuleDebugFragmentVisitor &V) {
-  BinaryStreamReader Reader(R.getRecordData());
-  switch (R.kind()) {
-  case ModuleDebugFragmentKind::Lines: {
-    ModuleDebugLineFragmentRef Fragment;
-    if (auto EC = Fragment.initialize(Reader))
-      return EC;
-
-    return V.visitLines(Fragment);
-  }
-  case ModuleDebugFragmentKind::FileChecksums: {
-    ModuleDebugFileChecksumFragmentRef Fragment;
-    if (auto EC = Fragment.initialize(Reader))
-      return EC;
-
-    return V.visitFileChecksums(Fragment);
-  }
-  case ModuleDebugFragmentKind::InlineeLines: {
-    ModuleDebugInlineeLineFragmentRef Fragment;
-    if (auto EC = Fragment.initialize(Reader))
-      return EC;
-    return V.visitInlineeLines(Fragment);
-  }
-  default: {
-    ModuleDebugUnknownFragmentRef Fragment(R.kind(), R.getRecordData());
-    return V.visitUnknown(Fragment);
-  }
-  }
-}
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
index 7d01c8c5f194..2f5a7d256c60 100644
--- a/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -11,8 +11,8 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 #include "llvm/DebugInfo/CodeView/EnumTables.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
 #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
@@ -369,7 +369,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(
   DictScope S(W, "DefRangeSubfield");
 
   if (ObjDelegate) {
-    StringTableRef Strings = ObjDelegate->getStringTable();
+    DebugStringTableSubsectionRef Strings = ObjDelegate->getStringTable();
     auto ExpectedProgram = Strings.getString(DefRangeSubfield.Program);
     if (!ExpectedProgram) {
       consumeError(ExpectedProgram.takeError());
@@ -390,7 +390,7 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
   DictScope S(W, "DefRange");
 
   if (ObjDelegate) {
-    StringTableRef Strings = ObjDelegate->getStringTable();
+    DebugStringTableSubsectionRef Strings = ObjDelegate->getStringTable();
     auto ExpectedProgram = Strings.getString(DefRange.Program);
     if (!ExpectedProgram) {
       consumeError(ExpectedProgram.takeError());
diff --git a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
index 867864e47dce..b28ec2ff33ac 100644
--- a/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -10,7 +10,7 @@
 #include "llvm/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.h"
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentRecord.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
@@ -170,8 +170,8 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
 }
 
 void DbiModuleDescriptorBuilder::addC13Fragment(
-    std::unique_ptr<ModuleDebugLineFragment> Lines) {
-  ModuleDebugLineFragment &Frag = *Lines;
+    std::unique_ptr<DebugLinesSubsection> Lines) {
+  DebugLinesSubsection &Frag = *Lines;
 
   // File Checksums have to come first, so push an empty entry on if this
   // is the first.
@@ -180,12 +180,12 @@ void DbiModuleDescriptorBuilder::addC13Fragment(
 
   this->LineInfo.push_back(std::move(Lines));
   C13Builders.push_back(
-      llvm::make_unique<ModuleDebugFragmentRecordBuilder>(Frag.kind(), Frag));
+      llvm::make_unique<DebugSubsectionRecordBuilder>(Frag.kind(), Frag));
 }
 
 void DbiModuleDescriptorBuilder::addC13Fragment(
-    std::unique_ptr<codeview::ModuleDebugInlineeLineFragment> Inlinees) {
-  ModuleDebugInlineeLineFragment &Frag = *Inlinees;
+    std::unique_ptr<codeview::DebugInlineeLinesSubsection> Inlinees) {
+  DebugInlineeLinesSubsection &Frag = *Inlinees;
 
   // File Checksums have to come first, so push an empty entry on if this
   // is the first.
@@ -194,17 +194,17 @@ void DbiModuleDescriptorBuilder::addC13Fragment(
 
   this->Inlinees.push_back(std::move(Inlinees));
   C13Builders.push_back(
-      llvm::make_unique<ModuleDebugFragmentRecordBuilder>(Frag.kind(), Frag));
+      llvm::make_unique<DebugSubsectionRecordBuilder>(Frag.kind(), Frag));
 }
 
 void DbiModuleDescriptorBuilder::setC13FileChecksums(
-    std::unique_ptr<ModuleDebugFileChecksumFragment> Checksums) {
+    std::unique_ptr<DebugChecksumsSubsection> Checksums) {
   assert(!ChecksumInfo && "Can't have more than one checksum info!");
 
   if (C13Builders.empty())
     C13Builders.push_back(nullptr);
 
   ChecksumInfo = std::move(Checksums);
-  C13Builders[0] = llvm::make_unique<ModuleDebugFragmentRecordBuilder>(
+  C13Builders[0] = llvm::make_unique<DebugSubsectionRecordBuilder>(
       ChecksumInfo->kind(), *ChecksumInfo);
 }
diff --git a/lib/MC/MCCodeView.cpp b/lib/MC/MCCodeView.cpp
index 2b97ecc0fd2c..a0a0ef312276 100644
--- a/lib/MC/MCCodeView.cpp
+++ b/lib/MC/MCCodeView.cpp
@@ -145,7 +145,7 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
   MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false),
            *StringEnd = Ctx.createTempSymbol("strtab_end", false);
 
-  OS.EmitIntValue(unsigned(ModuleDebugFragmentKind::StringTable), 4);
+  OS.EmitIntValue(unsigned(DebugSubsectionKind::StringTable), 4);
   OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
   OS.EmitLabel(StringBegin);
 
@@ -172,7 +172,7 @@ void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
   MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
            *FileEnd = Ctx.createTempSymbol("filechecksums_end", false);
 
-  OS.EmitIntValue(unsigned(ModuleDebugFragmentKind::FileChecksums), 4);
+  OS.EmitIntValue(unsigned(DebugSubsectionKind::FileChecksums), 4);
   OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
   OS.EmitLabel(FileBegin);
 
@@ -197,7 +197,7 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
   MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
            *LineEnd = Ctx.createTempSymbol("linetable_end", false);
 
-  OS.EmitIntValue(unsigned(ModuleDebugFragmentKind::Lines), 4);
+  OS.EmitIntValue(unsigned(DebugSubsectionKind::Lines), 4);
   OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
   OS.EmitLabel(LineBegin);
   OS.EmitCOFFSecRel32(FuncBegin, /*Offset=*/0);
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index e1e2c22e1df1..f36c25a0ce91 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -1559,11 +1559,13 @@ IEEEFloat::opStatus IEEEFloat::divideSpecials(const IEEEFloat &rhs) {
   case PackCategoriesIntoKey(fcInfinity, fcNaN):
     category = fcNaN;
     copySignificand(rhs);
+    LLVM_FALLTHROUGH;
   case PackCategoriesIntoKey(fcNaN, fcZero):
   case PackCategoriesIntoKey(fcNaN, fcNormal):
   case PackCategoriesIntoKey(fcNaN, fcInfinity):
   case PackCategoriesIntoKey(fcNaN, fcNaN):
     sign = false;
+    LLVM_FALLTHROUGH;
   case PackCategoriesIntoKey(fcInfinity, fcZero):
   case PackCategoriesIntoKey(fcInfinity, fcNormal):
   case PackCategoriesIntoKey(fcZero, fcInfinity):
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index 8d68c6ae9682..dec6baf7bf47 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -72,10 +72,15 @@ std::unique_ptr<raw_fd_ostream> llvm::CreateInfoOutputFile() {
   return llvm::make_unique<raw_fd_ostream>(2, false); // stderr.
 }
 
-static TimerGroup *getDefaultTimerGroup() {
-  static TimerGroup DefaultTimerGroup("misc", "Miscellaneous Ungrouped Timers");
-  return &DefaultTimerGroup;
-}
+namespace {
+struct CreateDefaultTimerGroup {
+  static void *call() {
+    return new TimerGroup("misc", "Miscellaneous Ungrouped Timers");
+  }
+};
+} // namespace
+static ManagedStatic<TimerGroup, CreateDefaultTimerGroup> DefaultTimerGroup;
+static TimerGroup *getDefaultTimerGroup() { return &*DefaultTimerGroup; }
 
 //===----------------------------------------------------------------------===//
 // Timer Implementation
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index 09f9759ce7da..f07208b1fb90 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -405,27 +405,21 @@ IntInit::convertInitializerBitRange(ArrayRef<unsigned> Bits) const {
 }
 
 CodeInit *CodeInit::get(StringRef V) {
-  static DenseMap<StringRef, CodeInit*> ThePool;
+  static StringMap<CodeInit*, BumpPtrAllocator &> ThePool(Allocator);
 
-  auto I = ThePool.insert(std::make_pair(V, nullptr));
-  if (I.second) {
-    StringRef VCopy = V.copy(Allocator);
-    I.first->first = VCopy;
-    I.first->second = new(Allocator) CodeInit(VCopy);
-  }
-  return I.first->second;
+  auto &Entry = *ThePool.insert(std::make_pair(V, nullptr)).first;
+  if (!Entry.second)
+    Entry.second = new(Allocator) CodeInit(Entry.getKey());
+  return Entry.second;
 }
 
 StringInit *StringInit::get(StringRef V) {
-  static DenseMap<StringRef, StringInit*> ThePool;
+  static StringMap<StringInit*, BumpPtrAllocator &> ThePool(Allocator);
 
-  auto I = ThePool.insert(std::make_pair(V, nullptr));
-  if (I.second) {
-    StringRef VCopy = V.copy(Allocator);
-    I.first->first = VCopy;
-    I.first->second = new(Allocator) StringInit(VCopy);
-  }
-  return I.first->second;
+  auto &Entry = *ThePool.insert(std::make_pair(V, nullptr)).first;
+  if (!Entry.second)
+    Entry.second = new(Allocator) StringInit(Entry.getKey());
+  return Entry.second;
 }
 
 Init *StringInit::convertInitializerTo(RecTy *Ty) const {
@@ -1540,7 +1534,7 @@ Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const {
   SmallVector<Init*, 8> NewArgs;
   NewArgs.reserve(arg_size());
   bool ArgsChanged = false;
-  for (const Init *Arg : args()) {
+  for (const Init *Arg : getArgs()) {
     Init *NewArg = Arg->resolveReferences(R, RV);
     NewArgs.push_back(NewArg);
     ArgsChanged |= NewArg != Arg;
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 0b92249580c8..e96ee7d29b3e 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -137,6 +137,34 @@ static cl::opt<bool> EnableRedZone("aarch64-redzone",
 
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
+/// Look at each instruction that references stack frames and return the stack
+/// size limit beyond which some of these instructions will require a scratch
+/// register during their expansion later.
+static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
+  // FIXME: For now, just conservatively guestimate based on unscaled indexing
+  // range. We'll end up allocating an unnecessary spill slot a lot, but
+  // realistically that's not a big deal at this stage of the game.
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      if (MI.isDebugValue() || MI.isPseudo() ||
+          MI.getOpcode() == AArch64::ADDXri ||
+          MI.getOpcode() == AArch64::ADDSXri)
+        continue;
+
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
+
+        int Offset = 0;
+        if (isAArch64FrameOffsetLegal(MI, Offset, nullptr, nullptr, nullptr) ==
+            AArch64FrameOffsetCannotUpdate)
+          return 0;
+      }
+    }
+  }
+  return 255;
+}
+
 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   if (!EnableRedZone)
     return false;
@@ -1169,16 +1197,13 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
   unsigned NumRegsSpilled = SavedRegs.count();
   bool CanEliminateFrame = NumRegsSpilled == 0;
 
-  // FIXME: Set BigStack if any stack slot references may be out of range.
-  // For now, just conservatively guestimate based on unscaled indexing
-  // range. We'll end up allocating an unnecessary spill slot a lot, but
-  // realistically that's not a big deal at this stage of the game.
   // The CSR spill slots have not been allocated yet, so estimateStackSize
   // won't include them.
   MachineFrameInfo &MFI = MF.getFrameInfo();
   unsigned CFSize = MFI.estimateStackSize(MF) + 8 * NumRegsSpilled;
   DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
-  bool BigStack = (CFSize >= 256);
+  unsigned EstimatedStackSizeLimit = estimateRSStackSizeLimit(MF);
+  bool BigStack = (CFSize > EstimatedStackSizeLimit);
   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
     AFI->setHasStackFrame(true);
 
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 62f4c953830b..f798010906cc 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -381,7 +381,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FNEARBYINT, MVT::v4f16, Expand);
   setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
   setOperationAction(ISD::FPOW, MVT::v4f16, Expand);
-  setOperationAction(ISD::FPOWI, MVT::v4f16, Expand);
   setOperationAction(ISD::FREM, MVT::v4f16, Expand);
   setOperationAction(ISD::FROUND, MVT::v4f16, Expand);
   setOperationAction(ISD::FRINT, MVT::v4f16, Expand);
@@ -413,7 +412,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FNEARBYINT, MVT::v8f16, Expand);
   setOperationAction(ISD::FNEG, MVT::v8f16, Expand);
   setOperationAction(ISD::FPOW, MVT::v8f16, Expand);
-  setOperationAction(ISD::FPOWI, MVT::v8f16, Expand);
   setOperationAction(ISD::FREM, MVT::v8f16, Expand);
   setOperationAction(ISD::FROUND, MVT::v8f16, Expand);
   setOperationAction(ISD::FRINT, MVT::v8f16, Expand);
@@ -726,7 +724,6 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
   if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
     setOperationAction(ISD::FSIN, VT, Expand);
     setOperationAction(ISD::FCOS, VT, Expand);
-    setOperationAction(ISD::FPOWI, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FLOG, VT, Expand);
     setOperationAction(ISD::FLOG2, VT, Expand);
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 660879426810..0582ce95693a 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -730,7 +730,7 @@ class SISubtarget final : public AMDGPUSubtarget {
   /// \returns True if waitcnt instruction is needed before barrier instruction,
   /// false otherwise.
   bool needWaitcntBeforeBarrier() const {
-    return getGeneration() < GFX9;
+    return true;
   }
 
   /// \returns true if the flat_scratch register should be initialized with the
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index a9d3a31a7240..48827f463997 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -736,6 +736,9 @@ void GCNPassConfig::addMachineSSAOptimization() {
   addPass(createSIShrinkInstructionsPass());
   if (EnableSDWAPeephole) {
     addPass(&SIPeepholeSDWAID);
+    addPass(&MachineLICMID);
+    addPass(&MachineCSEID);
+    addPass(&SIFoldOperandsID);
     addPass(&DeadMachineInstructionElimID);
   }
 }
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index d63414735b95..f13629a3185f 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -247,9 +247,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
 
 // If the use operand doesn't care about the value, this may be an operand only
 // used for register indexing, in which case it is unsafe to fold.
-static bool isUseSafeToFold(const MachineInstr &MI,
+static bool isUseSafeToFold(const SIInstrInfo *TII,
+                            const MachineInstr &MI,
                             const MachineOperand &UseMO) {
-  return !UseMO.isUndef();
+  return !UseMO.isUndef() && !TII->isSDWA(MI);
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }
 
@@ -261,7 +262,7 @@ void SIFoldOperands::foldOperand(
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
 
-  if (!isUseSafeToFold(*UseMI, UseOp))
+  if (!isUseSafeToFold(TII, *UseMI, UseOp))
     return;
 
   // FIXME: Fold operands with subregs.
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 4dc090d9b7ed..fae249b04492 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -55,6 +55,7 @@ class SIPeepholeSDWA : public MachineFunctionPass {
 
   std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
   std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+  SmallVector<MachineInstr *, 8> ConvertedInstructions;
 
   Optional<int64_t> foldToImm(const MachineOperand &Op) const;
 
@@ -69,6 +70,7 @@ class SIPeepholeSDWA : public MachineFunctionPass {
   void matchSDWAOperands(MachineFunction &MF);
   bool isConvertibleToSDWA(const MachineInstr &MI) const;
   bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+  void legalizeScalarOperands(MachineInstr &MI) const;
 
   StringRef getPassName() const override { return "SI Peephole SDWA"; }
 
@@ -289,7 +291,7 @@ bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
   MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
   MachineOperand *SrcMods =
       TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
-  assert(Src && Src->isReg());
+  assert(Src && (Src->isReg() || Src->isImm()));
   if (!isSameReg(*Src, *getReplacedOperand())) {
     // If this is not src0 then it should be src1
     Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
@@ -580,18 +582,8 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
 }
 
 bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
-  // Check if this instruction can be converted to SDWA:
-  // 1. Does this opcode support SDWA
-  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
-    return false;
-
-  // 2. Are all operands - VGPRs
-  for (const MachineOperand &Operand : MI.explicit_operands()) {
-    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
-      return false;
-  }
-
-  return true;
+  // Check if this instruction has opcode that supports SDWA
+  return AMDGPU::getSDWAOp(MI.getOpcode()) != -1;
 }
 
 bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
@@ -685,7 +677,9 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
     if (PotentialMatches.count(Operand->getParentInst()) == 0)
       Converted |= Operand->convertToSDWA(*SDWAInst, TII);
   }
-  if (!Converted) {
+  if (Converted) {
+    ConvertedInstructions.push_back(SDWAInst);
+  } else {
     SDWAInst->eraseFromParent();
     return false;
   }
@@ -698,6 +692,29 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
   return true;
 }
 
+// If an instruction was converted to SDWA it should not have immediates or SGPR
+// operands. Copy its scalar operands into VGPRs.
+void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
+    MachineOperand &Op = MI.getOperand(I);
+    if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
+      continue;
+    if (Desc.OpInfo[I].RegClass == -1 ||
+       !TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
+      continue;
+    unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
+                        TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
+    if (Op.isImm())
+      Copy.addImm(Op.getImm());
+    else if (Op.isReg())
+      Copy.addReg(Op.getReg(), Op.isKill() ? RegState::Kill : 0,
+                  Op.getSubReg());
+    Op.ChangeToRegister(VGPR, false);
+  }
+}
+
 bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
 
@@ -728,5 +745,9 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
 
   PotentialMatches.clear();
   SDWAOperands.clear();
+
+  while (!ConvertedInstructions.empty())
+    legalizeScalarOperands(*ConvertedInstructions.pop_back_val());
+
   return false;
 }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 62e774d869da..949d821e36b2 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -585,7 +585,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v2f64, Expand);
     setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
     setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
-    setOperationAction(ISD::FPOWI, MVT::v2f64, Expand);
     setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
     setOperationAction(ISD::FLOG2, MVT::v2f64, Expand);
@@ -603,7 +602,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
-    setOperationAction(ISD::FPOWI, MVT::v4f32, Expand);
     setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG2, MVT::v4f32, Expand);
@@ -620,7 +618,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT, MVT::v2f32, Expand);
     setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
     setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
-    setOperationAction(ISD::FPOWI, MVT::v2f32, Expand);
     setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
     setOperationAction(ISD::FLOG2, MVT::v2f32, Expand);
@@ -743,7 +740,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FSQRT,      MVT::f64, Expand);
     setOperationAction(ISD::FSIN,       MVT::f64, Expand);
     setOperationAction(ISD::FCOS,       MVT::f64, Expand);
-    setOperationAction(ISD::FPOWI,      MVT::f64, Expand);
     setOperationAction(ISD::FPOW,       MVT::f64, Expand);
     setOperationAction(ISD::FLOG,       MVT::f64, Expand);
     setOperationAction(ISD::FLOG2,      MVT::f64, Expand);
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 1dffebe97f2d..5ecf9320d5c2 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -2003,7 +2003,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
     // Floating point arithmetic/math functions:
     ISD::FADD,    ISD::FSUB,    ISD::FMUL,    ISD::FMA,     ISD::FDIV,
     ISD::FREM,    ISD::FNEG,    ISD::FABS,    ISD::FSQRT,   ISD::FSIN,
-    ISD::FCOS,    ISD::FPOWI,   ISD::FPOW,    ISD::FLOG,    ISD::FLOG2,
+    ISD::FCOS,    ISD::FPOW,    ISD::FLOG,    ISD::FLOG2,
     ISD::FLOG10,  ISD::FEXP,    ISD::FEXP2,   ISD::FCEIL,   ISD::FTRUNC,
     ISD::FRINT,   ISD::FNEARBYINT,            ISD::FROUND,  ISD::FFLOOR,
     ISD::FMINNUM, ISD::FMAXNUM, ISD::FSINCOS,
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index d407774574be..d855d3e7f778 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -13,6 +13,7 @@
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "MipsTargetStreamer.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
@@ -216,9 +217,15 @@ class MipsAsmParser : public MCTargetAsmParser {
                                unsigned SrcReg, bool Is32BitSym, SMLoc IDLoc,
                                MCStreamer &Out, const MCSubtargetInfo *STI);
 
+  bool emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc, MCSymbol *Sym);
+
   bool expandLoadImm(MCInst &Inst, bool Is32BitImm, SMLoc IDLoc,
                      MCStreamer &Out, const MCSubtargetInfo *STI);
 
+  bool expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR, bool Is64FPU,
+                         SMLoc IDLoc, MCStreamer &Out,
+                         const MCSubtargetInfo *STI);
+
   bool expandLoadAddress(unsigned DstReg, unsigned BaseReg,
                          const MCOperand &Offset, bool Is32BitAddress,
                          SMLoc IDLoc, MCStreamer &Out,
@@ -1011,6 +1018,16 @@ class MipsOperand : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
   }
 
+  void addStrictlyAFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getAFGR64Reg()));
+  }
+
+  void addStrictlyFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
+  }
+
   void addFGR64AsmRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createReg(getFGR64Reg()));
@@ -1027,6 +1044,15 @@ class MipsOperand : public MCParsedAsmOperand {
                     "registers");
   }
 
+  void addStrictlyFGR32AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getFGR32Reg()));
+    // FIXME: We ought to do this for -integrated-as without -via-file-asm too.
+    if (!AsmParser.useOddSPReg() && RegIdx.Index & 1)
+      AsmParser.Error(StartLoc, "-mno-odd-spreg prohibits the use of odd FPU "
+                                "registers");
+  }
+
   void addFGRH32AsmRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createReg(getFGRH32Reg()));
@@ -1574,6 +1600,11 @@ class MipsOperand : public MCParsedAsmOperand {
     return isRegIdx() && RegIdx.Kind & RegKind_FGR && RegIdx.Index <= 31;
   }
 
+  bool isStrictlyFGRAsmReg() const {
+    // AFGR64 is $0-$15 but we handle this in getAFGR64()
+    return isRegIdx() && RegIdx.Kind == RegKind_FGR && RegIdx.Index <= 31;
+  }
+
   bool isHWRegsAsmReg() const {
     return isRegIdx() && RegIdx.Kind & RegKind_HWRegs && RegIdx.Index <= 31;
   }
@@ -2368,6 +2399,27 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   case Mips::PseudoTRUNC_W_D:
     return expandTrunc(Inst, true, true, IDLoc, Out, STI) ? MER_Fail
                                                           : MER_Success;
+
+  case Mips::LoadImmSingleGPR:
+    return expandLoadImmReal(Inst, true, true, false, IDLoc, Out, STI)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::LoadImmSingleFGR:
+    return expandLoadImmReal(Inst, true, false, false, IDLoc, Out, STI)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::LoadImmDoubleGPR:
+    return expandLoadImmReal(Inst, false, true, false, IDLoc, Out, STI)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::LoadImmDoubleFGR:
+      return expandLoadImmReal(Inst, false, false, true, IDLoc, Out, STI)
+               ? MER_Fail
+               : MER_Success;
+  case Mips::LoadImmDoubleFGR_32:
+    return expandLoadImmReal(Inst, false, false, false, IDLoc, Out, STI)
+               ? MER_Fail
+               : MER_Success;
   case Mips::Ulh:
     return expandUlh(Inst, true, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::Ulhu:
@@ -2952,6 +3004,302 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
   return false;
 }
 
+// Each double-precision register DO-D15 overlaps with two of the single
+// precision registers F0-F31. As an example, all of the following hold true:
+// D0 + 1 == F1, F1 + 1 == D1, F1 + 1 == F2, depending on the context.
+static unsigned nextReg(unsigned Reg) {
+  if (MipsMCRegisterClasses[Mips::FGR32RegClassID].contains(Reg))
+    return Reg == (unsigned)Mips::F31 ? (unsigned)Mips::F0 : Reg + 1;
+  switch (Reg) {
+  default: llvm_unreachable("Unknown register in assembly macro expansion!");
+  case Mips::ZERO: return Mips::AT;
+  case Mips::AT:   return Mips::V0;
+  case Mips::V0:   return Mips::V1;
+  case Mips::V1:   return Mips::A0;
+  case Mips::A0:   return Mips::A1;
+  case Mips::A1:   return Mips::A2;
+  case Mips::A2:   return Mips::A3;
+  case Mips::A3:   return Mips::T0;
+  case Mips::T0:   return Mips::T1;
+  case Mips::T1:   return Mips::T2;
+  case Mips::T2:   return Mips::T3;
+  case Mips::T3:   return Mips::T4;
+  case Mips::T4:   return Mips::T5;
+  case Mips::T5:   return Mips::T6;
+  case Mips::T6:   return Mips::T7;
+  case Mips::T7:   return Mips::S0;
+  case Mips::S0:   return Mips::S1;
+  case Mips::S1:   return Mips::S2;
+  case Mips::S2:   return Mips::S3;
+  case Mips::S3:   return Mips::S4;
+  case Mips::S4:   return Mips::S5;
+  case Mips::S5:   return Mips::S6;
+  case Mips::S6:   return Mips::S7;
+  case Mips::S7:   return Mips::T8;
+  case Mips::T8:   return Mips::T9;
+  case Mips::T9:   return Mips::K0;
+  case Mips::K0:   return Mips::K1;
+  case Mips::K1:   return Mips::GP;
+  case Mips::GP:   return Mips::SP;
+  case Mips::SP:   return Mips::FP;
+  case Mips::FP:   return Mips::RA;
+  case Mips::RA:   return Mips::ZERO;
+  case Mips::D0:   return Mips::F1;
+  case Mips::D1:   return Mips::F3;
+  case Mips::D2:   return Mips::F5;
+  case Mips::D3:   return Mips::F7;
+  case Mips::D4:   return Mips::F9;
+  case Mips::D5:   return Mips::F11;
+  case Mips::D6:   return Mips::F13;
+  case Mips::D7:   return Mips::F15;
+  case Mips::D8:   return Mips::F17;
+  case Mips::D9:   return Mips::F19;
+  case Mips::D10:   return Mips::F21;
+  case Mips::D11:   return Mips::F23;
+  case Mips::D12:   return Mips::F25;
+  case Mips::D13:   return Mips::F27;
+  case Mips::D14:   return Mips::F29;
+  case Mips::D15:   return Mips::F31;
+  }
+}
+
+// FIXME: This method is too general. In principle we should compute the number
+// of instructions required to synthesize the immediate inline compared to
+// synthesizing the address inline and relying on non .text sections.
+// For static O32 and N32 this may yield a small benefit, for static N64 this is
+// likely to yield a much larger benefit as we have to synthesize a 64bit
+// address to load a 64 bit value.
+bool MipsAsmParser::emitPartialAddress(MipsTargetStreamer &TOut, SMLoc IDLoc,
+                                       MCSymbol *Sym) {
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  if(IsPicEnabled) {
+    const MCExpr *GotSym =
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+    const MipsMCExpr *GotExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_GOT, GotSym, getContext());
+
+    if(isABI_O32() || isABI_N32()) {
+      TOut.emitRRX(Mips::LW, ATReg, Mips::GP, MCOperand::createExpr(GotExpr),
+                   IDLoc, STI);
+    } else { //isABI_N64()
+      TOut.emitRRX(Mips::LD, ATReg, Mips::GP, MCOperand::createExpr(GotExpr),
+                   IDLoc, STI);
+    }
+  } else { //!IsPicEnabled
+    const MCExpr *HiSym =
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+    const MipsMCExpr *HiExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_HI, HiSym, getContext());
+
+    // FIXME: This is technically correct but gives a different result to gas,
+    // but gas is incomplete there (it has a fixme noting it doesn't work with
+    // 64-bit addresses).
+    // FIXME: With -msym32 option, the address expansion for N64 should probably
+    // use the O32 / N32 case. It's safe to use the 64 address expansion as the
+    // symbol's value is considered sign extended.
+    if(isABI_O32() || isABI_N32()) {
+      TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+    } else { //isABI_N64()
+      const MCExpr *HighestSym =
+          MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+      const MipsMCExpr *HighestExpr =
+          MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, HighestSym, getContext());
+      const MCExpr *HigherSym =
+          MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+      const MipsMCExpr *HigherExpr =
+          MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, HigherSym, getContext());
+
+      TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                  STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+                   MCOperand::createExpr(HigherExpr), IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(HiExpr),
+                   IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, ATReg, ATReg, 16, IDLoc, STI);
+    }
+  }
+  return false;
+}
+
+bool MipsAsmParser::expandLoadImmReal(MCInst &Inst, bool IsSingle, bool IsGPR,
+                                      bool Is64FPU, SMLoc IDLoc,
+                                      MCStreamer &Out,
+                                      const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  assert(Inst.getNumOperands() == 2 && "Invalid operand count");
+  assert(Inst.getOperand(0).isReg() && Inst.getOperand(1).isImm() &&
+         "Invalid instruction operand.");
+
+  unsigned FirstReg = Inst.getOperand(0).getReg();
+  uint64_t ImmOp64 = Inst.getOperand(1).getImm();
+
+  uint32_t HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
+  // If ImmOp64 is AsmToken::Integer type (all bits set to zero in the
+  // exponent field), convert it to double (e.g. 1 to 1.0)
+  if ((HiImmOp64 & 0x7ff00000) == 0) {
+    APFloat RealVal(APFloat::IEEEdouble(), ImmOp64);
+    ImmOp64 = RealVal.bitcastToAPInt().getZExtValue();
+  }
+
+  uint32_t LoImmOp64 = ImmOp64 & 0xffffffff;
+  HiImmOp64 = (ImmOp64 & 0xffffffff00000000) >> 32;
+
+  if (IsSingle) {
+    // Conversion of a double in an uint64_t to a float in a uint32_t,
+    // retaining the bit pattern of a float.
+    uint32_t ImmOp32;
+    double doubleImm = BitsToDouble(ImmOp64);
+    float tmp_float = static_cast<float>(doubleImm);
+    ImmOp32 = FloatToBits(tmp_float);
+
+    if (IsGPR) {
+      if (loadImmediate(ImmOp32, FirstReg, Mips::NoRegister, true, true, IDLoc,
+                        Out, STI))
+        return true;
+      return false;
+    } else {
+      unsigned ATReg = getATReg(IDLoc);
+      if (!ATReg)
+        return true;
+      if (LoImmOp64 == 0) {
+        if (loadImmediate(ImmOp32, ATReg, Mips::NoRegister, true, true, IDLoc,
+                          Out, STI))
+          return true;
+        TOut.emitRR(Mips::MTC1, FirstReg, ATReg, IDLoc, STI);
+        return false;
+      }
+
+      MCSection *CS = getStreamer().getCurrentSectionOnly();
+      // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
+      // where appropriate.
+      MCSection *ReadOnlySection = getContext().getELFSection(
+          ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+      MCSymbol *Sym = getContext().createTempSymbol();
+      const MCExpr *LoSym =
+          MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+      const MipsMCExpr *LoExpr =
+          MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+      getStreamer().SwitchSection(ReadOnlySection);
+      getStreamer().EmitLabel(Sym, IDLoc);
+      getStreamer().EmitIntValue(ImmOp32, 4);
+      getStreamer().SwitchSection(CS);
+
+      if(emitPartialAddress(TOut, IDLoc, Sym))
+        return true;
+      TOut.emitRRX(Mips::LWC1, FirstReg, ATReg,
+                   MCOperand::createExpr(LoExpr), IDLoc, STI);
+    }
+    return false;
+  }
+
+  // if(!IsSingle)
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  if (IsGPR) {
+    if (LoImmOp64 == 0) {
+      if(isABI_N32() || isABI_N64()) {
+        if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, false, true,
+                          IDLoc, Out, STI))
+          return true;
+        return false;
+      } else {
+        if (loadImmediate(HiImmOp64, FirstReg, Mips::NoRegister, true, true,
+                        IDLoc, Out, STI))
+          return true;
+
+        if (loadImmediate(0, nextReg(FirstReg), Mips::NoRegister, true, true,
+                        IDLoc, Out, STI))
+          return true;
+        return false;
+      }
+    }
+
+    MCSection *CS = getStreamer().getCurrentSectionOnly();
+    MCSection *ReadOnlySection = getContext().getELFSection(
+        ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+    MCSymbol *Sym = getContext().createTempSymbol();
+    const MCExpr *LoSym =
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+    const MipsMCExpr *LoExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+    getStreamer().SwitchSection(ReadOnlySection);
+    getStreamer().EmitLabel(Sym, IDLoc);
+    getStreamer().EmitIntValue(HiImmOp64, 4);
+    getStreamer().EmitIntValue(LoImmOp64, 4);
+    getStreamer().SwitchSection(CS);
+
+    if(emitPartialAddress(TOut, IDLoc, Sym))
+      return true;
+    if(isABI_N64())
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg,
+                   MCOperand::createExpr(LoExpr), IDLoc, STI);
+    else
+      TOut.emitRRX(Mips::ADDiu, ATReg, ATReg,
+                   MCOperand::createExpr(LoExpr), IDLoc, STI);
+
+    if(isABI_N32() || isABI_N64())
+      TOut.emitRRI(Mips::LD, FirstReg, ATReg, 0, IDLoc, STI);
+    else {
+      TOut.emitRRI(Mips::LW, FirstReg, ATReg, 0, IDLoc, STI);
+      TOut.emitRRI(Mips::LW, nextReg(FirstReg), ATReg, 4, IDLoc, STI);
+    }
+    return false;
+  } else { // if(!IsGPR && !IsSingle)
+    if ((LoImmOp64 == 0) &&
+        !((HiImmOp64 & 0xffff0000) && (HiImmOp64 & 0x0000ffff))) {
+      // FIXME: In the case where the constant is zero, we can load the
+      // register directly from the zero register.
+      if (loadImmediate(HiImmOp64, ATReg, Mips::NoRegister, true, true, IDLoc,
+                        Out, STI))
+        return true;
+      if (isABI_N32() || isABI_N64())
+        TOut.emitRR(Mips::DMTC1, FirstReg, ATReg, IDLoc, STI);
+      else if (hasMips32r2()) {
+        TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
+        TOut.emitRRR(Mips::MTHC1_D32, FirstReg, FirstReg, ATReg, IDLoc, STI);
+      } else {
+        TOut.emitRR(Mips::MTC1, nextReg(FirstReg), ATReg, IDLoc, STI);
+        TOut.emitRR(Mips::MTC1, FirstReg, Mips::ZERO, IDLoc, STI);
+      }
+      return false;
+    }
+
+    MCSection *CS = getStreamer().getCurrentSectionOnly();
+    // FIXME: Enhance this expansion to use the .lit4 & .lit8 sections
+    // where appropriate.
+    MCSection *ReadOnlySection = getContext().getELFSection(
+        ".rodata", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
+
+    MCSymbol *Sym = getContext().createTempSymbol();
+    const MCExpr *LoSym =
+        MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+    const MipsMCExpr *LoExpr =
+        MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
+
+    getStreamer().SwitchSection(ReadOnlySection);
+    getStreamer().EmitLabel(Sym, IDLoc);
+    getStreamer().EmitIntValue(HiImmOp64, 4);
+    getStreamer().EmitIntValue(LoImmOp64, 4);
+    getStreamer().SwitchSection(CS);
+
+    if(emitPartialAddress(TOut, IDLoc, Sym))
+      return true;
+    TOut.emitRRX(Is64FPU ? Mips::LDC164 : Mips::LDC1, FirstReg, ATReg,
+                 MCOperand::createExpr(LoExpr), IDLoc, STI);
+  }
+  return false;
+}
+
 bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
                                                MCStreamer &Out,
                                                const MCSubtargetInfo *STI) {
@@ -4318,45 +4666,6 @@ bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   return false;
 }
 
-static unsigned nextReg(unsigned Reg) {
-  switch (Reg) {
-  case Mips::ZERO: return Mips::AT;
-  case Mips::AT:   return Mips::V0;
-  case Mips::V0:   return Mips::V1;
-  case Mips::V1:   return Mips::A0;
-  case Mips::A0:   return Mips::A1;
-  case Mips::A1:   return Mips::A2;
-  case Mips::A2:   return Mips::A3;
-  case Mips::A3:   return Mips::T0;
-  case Mips::T0:   return Mips::T1;
-  case Mips::T1:   return Mips::T2;
-  case Mips::T2:   return Mips::T3;
-  case Mips::T3:   return Mips::T4;
-  case Mips::T4:   return Mips::T5;
-  case Mips::T5:   return Mips::T6;
-  case Mips::T6:   return Mips::T7;
-  case Mips::T7:   return Mips::S0;
-  case Mips::S0:   return Mips::S1;
-  case Mips::S1:   return Mips::S2;
-  case Mips::S2:   return Mips::S3;
-  case Mips::S3:   return Mips::S4;
-  case Mips::S4:   return Mips::S5;
-  case Mips::S5:   return Mips::S6;
-  case Mips::S6:   return Mips::S7;
-  case Mips::S7:   return Mips::T8;
-  case Mips::T8:   return Mips::T9;
-  case Mips::T9:   return Mips::K0;
-  case Mips::K0:   return Mips::K1;
-  case Mips::K1:   return Mips::GP;
-  case Mips::GP:   return Mips::SP;
-  case Mips::SP:   return Mips::FP;
-  case Mips::FP:   return Mips::RA;
-  case Mips::RA:   return Mips::ZERO;
-  default:         return 0;
-  }
-
-}
-
 // Expand 'ld $<reg> offset($reg2)' to 'lw $<reg>, offset($reg2);
 //                                      lw $<reg+1>>, offset+4($reg2)'
 // or expand 'sd $<reg> offset($reg2)' to 'sw $<reg>, offset($reg2);
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 8fe4e75f3e18..760630c41176 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -362,7 +362,6 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setOperationAction(ISD::FCOS,              MVT::f64,   Expand);
   setOperationAction(ISD::FSINCOS,           MVT::f32,   Expand);
   setOperationAction(ISD::FSINCOS,           MVT::f64,   Expand);
-  setOperationAction(ISD::FPOWI,             MVT::f32,   Expand);
   setOperationAction(ISD::FPOW,              MVT::f32,   Expand);
   setOperationAction(ISD::FPOW,              MVT::f64,   Expand);
   setOperationAction(ISD::FLOG,              MVT::f32,   Expand);
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index df42d56d041b..d81a769d7fd9 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -681,6 +681,29 @@ def PseudoTRUNC_W_D : MipsAsmPseudoInst<(outs FGR32Opnd:$fd),
                                         "trunc.w.d\t$fd, $fs, $rs">,
                       FGR_64, HARDFLOAT;
 
+def LoadImmSingleGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                         (ins imm64:$fpimm),
+                                         "li.s\t$rd, $fpimm">;
+
+def LoadImmSingleFGR : MipsAsmPseudoInst<(outs StrictlyFGR32Opnd:$rd),
+                                         (ins imm64:$fpimm),
+                                         "li.s\t$rd, $fpimm">,
+                       HARDFLOAT;
+
+def LoadImmDoubleGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                         (ins imm64:$fpimm),
+                                         "li.d\t$rd, $fpimm">;
+
+def LoadImmDoubleFGR_32 : MipsAsmPseudoInst<(outs StrictlyAFGR64Opnd:$rd),
+                                            (ins imm64:$fpimm),
+                                            "li.d\t$rd, $fpimm">,
+                          FGR_32, HARDFLOAT;
+
+def LoadImmDoubleFGR : MipsAsmPseudoInst<(outs StrictlyFGR64Opnd:$rd),
+                                         (ins imm64:$fpimm),
+                                         "li.d\t$rd, $fpimm">,
+                       FGR_64, HARDFLOAT;
+
 //===----------------------------------------------------------------------===//
 // InstAliases.
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index ccfdcc89b078..08fb3d7d4352 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -552,16 +552,31 @@ def AFGR64AsmOperand : MipsAsmRegOperand {
   let PredicateMethod = "isFGRAsmReg";
 }
 
+def StrictlyAFGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "StrictlyAFGR64AsmReg";
+  let PredicateMethod = "isStrictlyFGRAsmReg";
+}
+
 def FGR64AsmOperand : MipsAsmRegOperand {
   let Name = "FGR64AsmReg";
   let PredicateMethod = "isFGRAsmReg";
 }
 
+def StrictlyFGR64AsmOperand : MipsAsmRegOperand {
+  let Name = "StrictlyFGR64AsmReg";
+  let PredicateMethod = "isStrictlyFGRAsmReg";
+}
+
 def FGR32AsmOperand : MipsAsmRegOperand {
   let Name = "FGR32AsmReg";
   let PredicateMethod = "isFGRAsmReg";
 }
 
+def StrictlyFGR32AsmOperand : MipsAsmRegOperand {
+  let Name = "StrictlyFGR32AsmReg";
+  let PredicateMethod = "isStrictlyFGRAsmReg";
+}
+
 def FGRH32AsmOperand : MipsAsmRegOperand {
   let Name = "FGRH32AsmReg";
   let PredicateMethod = "isFGRAsmReg";
@@ -639,14 +654,26 @@ def AFGR64Opnd : RegisterOperand<AFGR64> {
   let ParserMatchClass = AFGR64AsmOperand;
 }
 
+def StrictlyAFGR64Opnd : RegisterOperand<AFGR64> {
+  let ParserMatchClass = StrictlyAFGR64AsmOperand;
+}
+
 def FGR64Opnd : RegisterOperand<FGR64> {
   let ParserMatchClass = FGR64AsmOperand;
 }
 
+def StrictlyFGR64Opnd : RegisterOperand<FGR64> {
+  let ParserMatchClass = StrictlyFGR64AsmOperand;
+}
+
 def FGR32Opnd : RegisterOperand<FGR32> {
   let ParserMatchClass = FGR32AsmOperand;
 }
 
+def StrictlyFGR32Opnd : RegisterOperand<FGR32> {
+  let ParserMatchClass = StrictlyFGR32AsmOperand;
+}
+
 def FGRCCOpnd : RegisterOperand<FGRCC> {
   // The assembler doesn't use register classes so we can re-use
   // FGR32AsmOperand.
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index b90a5ee28342..216efcc4a1ee 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -539,7 +539,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
       setOperationAction(ISD::FSIN, VT, Expand);
       setOperationAction(ISD::FCOS, VT, Expand);
       setOperationAction(ISD::FABS, VT, Expand);
-      setOperationAction(ISD::FPOWI, VT, Expand);
       setOperationAction(ISD::FFLOOR, VT, Expand);
       setOperationAction(ISD::FCEIL,  VT, Expand);
       setOperationAction(ISD::FTRUNC, VT, Expand);
@@ -798,7 +797,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FABS , MVT::v4f64, Legal);
     setOperationAction(ISD::FSIN , MVT::v4f64, Expand);
     setOperationAction(ISD::FCOS , MVT::v4f64, Expand);
-    setOperationAction(ISD::FPOWI , MVT::v4f64, Expand);
     setOperationAction(ISD::FPOW , MVT::v4f64, Expand);
     setOperationAction(ISD::FLOG , MVT::v4f64, Expand);
     setOperationAction(ISD::FLOG2 , MVT::v4f64, Expand);
@@ -844,7 +842,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     setOperationAction(ISD::FABS , MVT::v4f32, Legal);
     setOperationAction(ISD::FSIN , MVT::v4f32, Expand);
     setOperationAction(ISD::FCOS , MVT::v4f32, Expand);
-    setOperationAction(ISD::FPOWI , MVT::v4f32, Expand);
     setOperationAction(ISD::FPOW , MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG , MVT::v4f32, Expand);
     setOperationAction(ISD::FLOG2 , MVT::v4f32, Expand);
diff --git a/lib/Target/SystemZ/SystemZ.td b/lib/Target/SystemZ/SystemZ.td
index 6bdfd4d07edc..c5f324418da5 100644
--- a/lib/Target/SystemZ/SystemZ.td
+++ b/lib/Target/SystemZ/SystemZ.td
@@ -54,6 +54,8 @@ include "SystemZInstrFormats.td"
 include "SystemZInstrInfo.td"
 include "SystemZInstrVector.td"
 include "SystemZInstrFP.td"
+include "SystemZInstrHFP.td"
+include "SystemZInstrDFP.td"
 
 def SystemZInstrInfo : InstrInfo {}
 
diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td
index 7bfa378aa85c..ffb0b8d1c861 100644
--- a/lib/Target/SystemZ/SystemZFeatures.td
+++ b/lib/Target/SystemZ/SystemZFeatures.td
@@ -115,12 +115,18 @@ def FeatureTransactionalExecution : SystemZFeature<
   "Assume that the transactional-execution facility is installed"
 >;
 
+def FeatureDFPZonedConversion : SystemZFeature<
+  "dfp-zoned-conversion", "DFPZonedConversion",
+  "Assume that the DFP zoned-conversion facility is installed"
+>;
+
 def Arch10NewFeatures : SystemZFeatureList<[
     FeatureExecutionHint,
     FeatureLoadAndTrap,
     FeatureMiscellaneousExtensions,
     FeatureProcessorAssist,
-    FeatureTransactionalExecution
+    FeatureTransactionalExecution,
+    FeatureDFPZonedConversion
 ]>;
 
 //===----------------------------------------------------------------------===//
@@ -144,6 +150,11 @@ def FeatureMessageSecurityAssist5 : SystemZFeature<
   "Assume that the message-security-assist extension facility 5 is installed"
 >;
 
+def FeatureDFPPackedConversion : SystemZFeature<
+  "dfp-packed-conversion", "DFPPackedConversion",
+  "Assume that the DFP packed-conversion facility is installed"
+>;
+
 def FeatureVector : SystemZFeature<
   "vector", "Vector",
   "Assume that the vectory facility is installed"
@@ -154,6 +165,7 @@ def Arch11NewFeatures : SystemZFeatureList<[
     FeatureLoadAndZeroRightmostByte,
     FeatureLoadStoreOnCond2,
     FeatureMessageSecurityAssist5,
+    FeatureDFPPackedConversion,
     FeatureVector
 ]>;
 
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 235e095f0010..ae141dbcad34 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -4189,12 +4189,20 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
   if (Single.getNode() && (Count > 1 || Single.getOpcode() == ISD::LOAD))
     return DAG.getNode(SystemZISD::REPLICATE, DL, VT, Single);
 
+  // If all elements are loads, use VLREP/VLEs (below).
+  bool AllLoads = true;
+  for (auto Elem : Elems)
+    if (Elem.getOpcode() != ISD::LOAD || cast<LoadSDNode>(Elem)->isIndexed()) {
+      AllLoads = false;
+      break;
+    }
+
   // The best way of building a v2i64 from two i64s is to use VLVGP.
-  if (VT == MVT::v2i64)
+  if (VT == MVT::v2i64 && !AllLoads)
     return joinDwords(DAG, DL, Elems[0], Elems[1]);
 
   // Use a 64-bit merge high to combine two doubles.
-  if (VT == MVT::v2f64)
+  if (VT == MVT::v2f64 && !AllLoads)
     return buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
 
   // Build v4f32 values directly from the FPRs:
@@ -4204,7 +4212,7 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
   //      <ABxx>         <CDxx>
   //                V                 VMRHG
   //              <ABCD>
-  if (VT == MVT::v4f32) {
+  if (VT == MVT::v4f32 && !AllLoads) {
     SDValue Op01 = buildMergeScalars(DAG, DL, VT, Elems[0], Elems[1]);
     SDValue Op23 = buildMergeScalars(DAG, DL, VT, Elems[2], Elems[3]);
     // Avoid unnecessary undefs by reusing the other operand.
@@ -4246,23 +4254,37 @@ static SDValue buildVector(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
         Constants[I] = DAG.getUNDEF(Elems[I].getValueType());
     Result = DAG.getBuildVector(VT, DL, Constants);
   } else {
-    // Otherwise try to use VLVGP to start the sequence in order to
+    // Otherwise try to use VLREP or VLVGP to start the sequence in order to
     // avoid a false dependency on any previous contents of the vector
-    // register.  This only makes sense if one of the associated elements
-    // is defined.
-    unsigned I1 = NumElements / 2 - 1;
-    unsigned I2 = NumElements - 1;
-    bool Def1 = !Elems[I1].isUndef();
-    bool Def2 = !Elems[I2].isUndef();
-    if (Def1 || Def2) {
-      SDValue Elem1 = Elems[Def1 ? I1 : I2];
-      SDValue Elem2 = Elems[Def2 ? I2 : I1];
-      Result = DAG.getNode(ISD::BITCAST, DL, VT,
-                           joinDwords(DAG, DL, Elem1, Elem2));
-      Done[I1] = true;
-      Done[I2] = true;
-    } else
-      Result = DAG.getUNDEF(VT);
+    // register.
+
+    // Use a VLREP if at least one element is a load.
+    unsigned LoadElIdx = UINT_MAX;
+    for (unsigned I = 0; I < NumElements; ++I)
+      if (Elems[I].getOpcode() == ISD::LOAD &&
+          cast<LoadSDNode>(Elems[I])->isUnindexed()) {
+        LoadElIdx = I;
+        break;
+      }
+    if (LoadElIdx != UINT_MAX) {
+      Result = DAG.getNode(SystemZISD::REPLICATE, DL, VT, Elems[LoadElIdx]);
+      Done[LoadElIdx] = true;
+    } else {
+      // Try to use VLVGP.
+      unsigned I1 = NumElements / 2 - 1;
+      unsigned I2 = NumElements - 1;
+      bool Def1 = !Elems[I1].isUndef();
+      bool Def2 = !Elems[I2].isUndef();
+      if (Def1 || Def2) {
+        SDValue Elem1 = Elems[Def1 ? I1 : I2];
+        SDValue Elem2 = Elems[Def2 ? I2 : I1];
+        Result = DAG.getNode(ISD::BITCAST, DL, VT,
+                             joinDwords(DAG, DL, Elem1, Elem2));
+        Done[I1] = true;
+        Done[I2] = true;
+      } else
+        Result = DAG.getUNDEF(VT);
+    }
   }
 
   // Use VLVGx to insert the other elements.
diff --git a/lib/Target/SystemZ/SystemZInstrDFP.td b/lib/Target/SystemZ/SystemZInstrDFP.td
new file mode 100644
index 000000000000..08ab2d7bbc52
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrDFP.td
@@ -0,0 +1,231 @@
+//==- SystemZInstrDFP.td - Floating-point SystemZ instructions -*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The instructions in this file implement SystemZ decimal floating-point
+// arithmetic.  These instructions are inot currently used for code generation,
+// are provided for use with the assembler and disassembler only.  If LLVM
+// ever supports decimal floating-point types (_Decimal64 etc.), they can
+// also be used for code generation for those types.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and test.
+let Defs = [CC] in {
+  def LTDTR : UnaryRRE<"ltdtr", 0xB3D6, null_frag, FP64,  FP64>;
+  def LTXTR : UnaryRRE<"ltxtr", 0xB3DE, null_frag, FP128, FP128>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations.  The destination
+// of LDXTR is a 128-bit value, but only the first register of the pair is used.
+def LEDTR : TernaryRRFe<"ledtr", 0xB3D5, FP32,  FP64>;
+def LDXTR : TernaryRRFe<"ldxtr", 0xB3DD, FP128, FP128>;
+
+// Extend floating-point values to wider representations.
+def LDETR : BinaryRRFd<"ldetr", 0xB3D4, FP64,  FP32>;
+def LXDTR : BinaryRRFd<"lxdtr", 0xB3DC, FP128, FP64>;
+
+// Convert a signed integer value to a floating-point one.
+def CDGTR : UnaryRRE<"cdgtr", 0xB3F1, null_frag, FP64,  GR64>;
+def CXGTR : UnaryRRE<"cxgtr", 0xB3F9, null_frag, FP128, GR64>;
+let Predicates = [FeatureFPExtension] in {
+  def CDGTRA : TernaryRRFe<"cdgtra", 0xB3F1, FP64,  GR64>;
+  def CXGTRA : TernaryRRFe<"cxgtra", 0xB3F9, FP128, GR64>;
+  def CDFTR : TernaryRRFe<"cdftr", 0xB951, FP64,  GR32>;
+  def CXFTR : TernaryRRFe<"cxftr", 0xB959, FP128, GR32>;
+}
+
+// Convert an unsigned integer value to a floating-point one.
+let Predicates = [FeatureFPExtension] in {
+  def CDLGTR : TernaryRRFe<"cdlgtr", 0xB952, FP64,  GR64>;
+  def CXLGTR : TernaryRRFe<"cxlgtr", 0xB95A, FP128, GR64>;
+  def CDLFTR : TernaryRRFe<"cdlftr", 0xB953, FP64,  GR32>;
+  def CXLFTR : TernaryRRFe<"cxlftr", 0xB95B, FP128, GR32>;
+}
+
+// Convert a floating-point value to a signed integer value.
+let Defs = [CC] in {
+  def CGDTR : BinaryRRFe<"cgdtr", 0xB3E1, GR64, FP64>;
+  def CGXTR : BinaryRRFe<"cgxtr", 0xB3E9, GR64, FP128>;
+  let Predicates = [FeatureFPExtension] in {
+    def CGDTRA : TernaryRRFe<"cgdtra", 0xB3E1, GR64, FP64>;
+    def CGXTRA : TernaryRRFe<"cgxtra", 0xB3E9, GR64, FP128>;
+    def CFDTR : TernaryRRFe<"cfdtr", 0xB941, GR32, FP64>;
+    def CFXTR : TernaryRRFe<"cfxtr", 0xB949, GR32, FP128>;
+  }
+}
+
+// Convert a floating-point value to an unsigned integer value.
+let Defs = [CC] in {
+  let Predicates = [FeatureFPExtension] in {
+    def CLGDTR : TernaryRRFe<"clgdtr", 0xB942, GR64, FP64>;
+    def CLGXTR : TernaryRRFe<"clgxtr", 0xB94A, GR64, FP128>;
+    def CLFDTR : TernaryRRFe<"clfdtr", 0xB943, GR32, FP64>;
+    def CLFXTR : TernaryRRFe<"clfxtr", 0xB94B, GR32, FP128>;
+  }
+}
+
+// Convert a packed value to a floating-point one.
+def CDSTR : UnaryRRE<"cdstr", 0xB3F3, null_frag, FP64,  GR64>;
+def CXSTR : UnaryRRE<"cxstr", 0xB3FB, null_frag, FP128, GR128>;
+def CDUTR : UnaryRRE<"cdutr", 0xB3F2, null_frag, FP64,  GR64>;
+def CXUTR : UnaryRRE<"cxutr", 0xB3FA, null_frag, FP128, GR128>;
+
+// Convert a floating-point value to a packed value.
+def CSDTR : BinaryRRFd<"csdtr", 0xB3E3, GR64,  FP64>;
+def CSXTR : BinaryRRFd<"csxtr", 0xB3EB, GR128, FP128>;
+def CUDTR : UnaryRRE<"cudtr", 0xB3E2, null_frag, GR64,  FP64>;
+def CUXTR : UnaryRRE<"cuxtr", 0xB3EA, null_frag, GR128, FP128>;
+
+// Convert from/to memory values in the zoned format.
+let Predicates = [FeatureDFPZonedConversion] in {
+  def CDZT : BinaryRSL<"cdzt", 0xEDAA, FP64>;
+  def CXZT : BinaryRSL<"cxzt", 0xEDAB, FP128>;
+  def CZDT : StoreBinaryRSL<"czdt", 0xEDA8, FP64>;
+  def CZXT : StoreBinaryRSL<"czxt", 0xEDA9, FP128>;
+}
+
+// Convert from/to memory values in the packed format.
+let Predicates = [FeatureDFPPackedConversion] in {
+  def CDPT : BinaryRSL<"cdpt", 0xEDAE, FP64>;
+  def CXPT : BinaryRSL<"cxpt", 0xEDAF, FP128>;
+  def CPDT : StoreBinaryRSL<"cpdt", 0xEDAC, FP64>;
+  def CPXT : StoreBinaryRSL<"cpxt", 0xEDAD, FP128>;
+}
+
+// Perform floating-point operation.
+let Defs = [CC, R1L, F0Q], Uses = [R0L, F4Q] in
+  def PFPO : SideEffectInherentE<"pfpo", 0x010A>;
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Round to an integer, with the second operand (M3) specifying the rounding
+// mode.  M4 can be set to 4 to suppress detection of inexact conditions.
+def FIDTR : TernaryRRFe<"fidtr", 0xB3D7, FP64,  FP64>;
+def FIXTR : TernaryRRFe<"fixtr", 0xB3DF, FP128, FP128>;
+
+// Extract biased exponent.
+def EEDTR : UnaryRRE<"eedtr", 0xB3E5, null_frag, FP64,  FP64>;
+def EEXTR : UnaryRRE<"eextr", 0xB3ED, null_frag, FP128, FP128>;
+
+// Extract significance.
+def ESDTR : UnaryRRE<"esdtr", 0xB3E7, null_frag, FP64,  FP64>;
+def ESXTR : UnaryRRE<"esxtr", 0xB3EF, null_frag, FP128, FP128>;
+
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC] in {
+  let isCommutable = 1 in {
+    def ADTR : BinaryRRFa<"adtr", 0xB3D2, null_frag, FP64,  FP64,  FP64>;
+    def AXTR : BinaryRRFa<"axtr", 0xB3DA, null_frag, FP128, FP128, FP128>;
+  }
+  let Predicates = [FeatureFPExtension] in {
+    def ADTRA : TernaryRRFa<"adtra", 0xB3D2, FP64,  FP64,  FP64>;
+    def AXTRA : TernaryRRFa<"axtra", 0xB3DA, FP128, FP128, FP128>;
+  }
+}
+
+// Subtraction.
+let Defs = [CC] in {
+  def SDTR : BinaryRRFa<"sdtr", 0xB3D3, null_frag, FP64,  FP64,  FP64>;
+  def SXTR : BinaryRRFa<"sxtr", 0xB3DB, null_frag, FP128, FP128, FP128>;
+  let Predicates = [FeatureFPExtension] in {
+    def SDTRA : TernaryRRFa<"sdtra", 0xB3D3, FP64,  FP64,  FP64>;
+    def SXTRA : TernaryRRFa<"sxtra", 0xB3DB, FP128, FP128, FP128>;
+  }
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+  def MDTR : BinaryRRFa<"mdtr", 0xB3D0, null_frag, FP64,  FP64,  FP64>;
+  def MXTR : BinaryRRFa<"mxtr", 0xB3D8, null_frag, FP128, FP128, FP128>;
+}
+let Predicates = [FeatureFPExtension] in {
+  def MDTRA : TernaryRRFa<"mdtra", 0xB3D0, FP64,  FP64,  FP64>;
+  def MXTRA : TernaryRRFa<"mxtra", 0xB3D8, FP128, FP128, FP128>;
+}
+
+// Division.
+def DDTR : BinaryRRFa<"ddtr", 0xB3D1, null_frag, FP64,  FP64,  FP64>;
+def DXTR : BinaryRRFa<"dxtr", 0xB3D9, null_frag, FP128, FP128, FP128>;
+let Predicates = [FeatureFPExtension] in {
+  def DDTRA : TernaryRRFa<"ddtra", 0xB3D1, FP64,  FP64,  FP64>;
+  def DXTRA : TernaryRRFa<"dxtra", 0xB3D9, FP128, FP128, FP128>;
+}
+
+// Quantize.
+def QADTR : TernaryRRFb<"qadtr", 0xB3F5, FP64,  FP64,  FP64>;
+def QAXTR : TernaryRRFb<"qaxtr", 0xB3FD, FP128, FP128, FP128>;
+
+// Reround.
+def RRDTR : TernaryRRFb<"rrdtr", 0xB3F7, FP64,  FP64,  FP64>;
+def RRXTR : TernaryRRFb<"rrxtr", 0xB3FF, FP128, FP128, FP128>;
+
+// Shift significand left/right.
+def SLDT : BinaryRXF<"sldt", 0xED40, null_frag, FP64,  FP64,  null_frag, 0>;
+def SLXT : BinaryRXF<"slxt", 0xED48, null_frag, FP128, FP128, null_frag, 0>;
+def SRDT : BinaryRXF<"srdt", 0xED41, null_frag, FP64,  FP64,  null_frag, 0>;
+def SRXT : BinaryRXF<"srxt", 0xED49, null_frag, FP128, FP128, null_frag, 0>;
+
+// Insert biased exponent.
+def IEDTR : BinaryRRFb<"iedtr", 0xB3F6, null_frag, FP64,  FP64,   FP64>;
+def IEXTR : BinaryRRFb<"iextr", 0xB3FE, null_frag, FP128, FP128, FP128>;
+
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare.
+let Defs = [CC] in {
+  def CDTR : CompareRRE<"cdtr", 0xB3E4, null_frag, FP64,  FP64>;
+  def CXTR : CompareRRE<"cxtr", 0xB3EC, null_frag, FP128, FP128>;
+}
+
+// Compare and signal.
+let Defs = [CC] in {
+  def KDTR : CompareRRE<"kdtr", 0xB3E0, null_frag, FP64,  FP64>;
+  def KXTR : CompareRRE<"kxtr", 0xB3E8, null_frag, FP128, FP128>;
+}
+
+// Compare biased exponent.
+let Defs = [CC] in {
+  def CEDTR : CompareRRE<"cedtr", 0xB3F4, null_frag, FP64,  FP64>;
+  def CEXTR : CompareRRE<"cextr", 0xB3FC, null_frag, FP128, FP128>;
+}
+
+// Test Data Class.
+let Defs = [CC] in {
+  def TDCET : TestRXE<"tdcet", 0xED50, null_frag, FP32>;
+  def TDCDT : TestRXE<"tdcdt", 0xED54, null_frag, FP64>;
+  def TDCXT : TestRXE<"tdcxt", 0xED58, null_frag, FP128>;
+}
+
+// Test Data Group.
+let Defs = [CC] in {
+  def TDGET : TestRXE<"tdget", 0xED51, null_frag, FP32>;
+  def TDGDT : TestRXE<"tdgdt", 0xED55, null_frag, FP64>;
+  def TDGXT : TestRXE<"tdgxt", 0xED59, null_frag, FP128>;
+}
+
diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td
index 364b81f98eed..10172bd45203 100644
--- a/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/lib/Target/SystemZ/SystemZInstrFP.td
@@ -121,7 +121,8 @@ let canFoldAsLoad = 1, SimpleBDXLoad = 1 in {
   defm LD : UnaryRXPair<"ld", 0x68, 0xED65, load, FP64, 8>;
 
   // For z13 we prefer LDE over LE to avoid partial register dependencies.
-  def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
+  let isCodeGenOnly = 1 in
+    def LDE32 : UnaryRXE<"lde", 0xED24, null_frag, FP32, 4>;
 
   // These instructions are split after register allocation, so we don't
   // want a custom inserter.
@@ -437,18 +438,18 @@ def : Pat<(fmul (f128 (fpextend FP64:$src1)),
                 bdxaddr12only:$addr)>;
 
 // Fused multiply-add.
-def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32>;
-def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64>;
+def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>;
+def MADBR : TernaryRRD<"madbr", 0xB31E, z_fma, FP64, FP64>;
 
-def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, load, 4>;
-def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, load, 8>;
+def MAEB : TernaryRXF<"maeb", 0xED0E, z_fma, FP32, FP32, load, 4>;
+def MADB : TernaryRXF<"madb", 0xED1E, z_fma, FP64, FP64, load, 8>;
 
 // Fused multiply-subtract.
-def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32>;
-def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64>;
+def MSEBR : TernaryRRD<"msebr", 0xB30F, z_fms, FP32, FP32>;
+def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_fms, FP64, FP64>;
 
-def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, load, 4>;
-def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, load, 8>;
+def MSEB : TernaryRXF<"mseb", 0xED0F, z_fms, FP32, FP32, load, 4>;
+def MSDB : TernaryRXF<"msdb", 0xED1F, z_fms, FP64, FP64, load, 8>;
 
 // Division.
 def DEBR : BinaryRRE<"debr", 0xB30D, fdiv, FP32,  FP32>;
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index a37da2807854..5f6115ed86a4 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -527,6 +527,22 @@ class InstRRFc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{3-0}   = R2;
 }
 
+class InstRRFd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<4, outs, ins, asmstr, pattern> {
+  field bits<32> Inst;
+  field bits<32> SoftFail = 0;
+
+  bits<4> R1;
+  bits<4> R2;
+  bits<4> M4;
+
+  let Inst{31-16} = op;
+  let Inst{15-12} = 0;
+  let Inst{11-8}  = M4;
+  let Inst{7-4}   = R1;
+  let Inst{3-0}   = R2;
+}
+
 class InstRRFe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<4, outs, ins, asmstr, pattern> {
   field bits<32> Inst;
@@ -725,6 +741,22 @@ class InstRSLa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   let Inst{7-0}   = op{7-0};
 }
 
+class InstRSLb<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
+  : InstSystemZ<6, outs, ins, asmstr, pattern> {
+  field bits<48> Inst;
+  field bits<48> SoftFail = 0;
+
+  bits<4> R1;
+  bits<24> BDL2;
+  bits<4> M3;
+
+  let Inst{47-40} = op{15-8};
+  let Inst{39-16} = BDL2;
+  let Inst{15-12} = R1;
+  let Inst{11-8}  = M3;
+  let Inst{7-0}   = op{7-0};
+}
+
 class InstRSYa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern>
   : InstSystemZ<6, outs, ins, asmstr, pattern> {
   field bits<48> Inst;
@@ -2752,6 +2784,15 @@ class BinaryRRE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let DisableEncoding = "$R1src";
 }
 
+class BinaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRD<opcode, (outs cls1:$R1), (ins cls2:$R3, cls2:$R2),
+            mnemonic#"\t$R1, $R3, $R2",
+            [(set cls1:$R1, (operator cls2:$R3, cls2:$R2))]> {
+  let OpKey = mnemonic#cls;
+  let OpType = "reg";
+}
+
 class BinaryRRFa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                  RegisterOperand cls1, RegisterOperand cls2,
                  RegisterOperand cls3>
@@ -2808,6 +2849,11 @@ multiclass BinaryMemRRFcOpt<string mnemonic, bits<16> opcode,
   def Opt : UnaryMemRRFc<mnemonic, opcode, cls1, cls2>;
 }
 
+class BinaryRRFd<string mnemonic, bits<16> opcode, RegisterOperand cls1,
+                RegisterOperand cls2>
+  : InstRRFd<opcode, (outs cls1:$R1), (ins cls2:$R2, imm32zx4:$M4),
+             mnemonic#"\t$R1, $R2, $M4", []>;
+
 class BinaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
                 RegisterOperand cls2>
   : InstRRFe<opcode, (outs cls1:$R1), (ins imm32zx4:$M3, cls2:$R2),
@@ -2958,6 +3004,13 @@ multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
   }
 }
 
+class BinaryRSL<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRSLb<opcode, (outs cls:$R1),
+             (ins bdladdr12onlylen8:$BDL2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $BDL2, $M3", []> {
+  let mayLoad = 1;
+}
+
 class BinaryRX<string mnemonic, bits<8> opcode, SDPatternOperator operator,
                RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
                AddressingMode mode = bdxaddr12only>
@@ -2987,6 +3040,18 @@ class BinaryRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator,
   let M3 = 0;
 }
 
+class BinaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                RegisterOperand cls1, RegisterOperand cls2,
+                SDPatternOperator load, bits<5> bytes>
+  : InstRXF<opcode, (outs cls1:$R1), (ins cls2:$R3, bdxaddr12only:$XBD2),
+            mnemonic#"\t$R1, $R3, $XBD2",
+            [(set cls1:$R1, (operator cls2:$R3, (load bdxaddr12only:$XBD2)))]> {
+  let OpKey = mnemonic#"r"#cls;
+  let OpType = "mem";
+  let mayLoad = 1;
+  let AccessBytes = bytes;
+}
+
 class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes,
                 AddressingMode mode = bdxaddr20only>
@@ -3294,6 +3359,13 @@ multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode,
   }
 }
 
+class StoreBinaryRSL<string mnemonic, bits<16> opcode, RegisterOperand cls>
+  : InstRSLb<opcode, (outs),
+             (ins cls:$R1, bdladdr12onlylen8:$BDL2, imm32zx4:$M3),
+             mnemonic#"\t$R1, $BDL2, $M3", []> {
+  let mayStore = 1;
+}
+
 class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes,
                      Immediate index>
   : InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3),
@@ -3581,6 +3653,12 @@ class SideEffectTernarySSF<string mnemonic, bits<12> opcode,
             (ins bdaddr12only:$BD1, bdaddr12only:$BD2, cls:$R3),
             mnemonic#"\t$BD1, $BD2, $R3", []>;
 
+class TernaryRRFa<string mnemonic, bits<16> opcode,
+                 RegisterOperand cls1, RegisterOperand cls2,
+                 RegisterOperand cls3>
+  : InstRRFa<opcode, (outs cls1:$R1), (ins cls2:$R2, cls3:$R3, imm32zx4:$M4),
+             mnemonic#"\t$R1, $R2, $R3, $M4", []>;
+
 class TernaryRRFb<string mnemonic, bits<16> opcode,
                   RegisterOperand cls1, RegisterOperand cls2,
                   RegisterOperand cls3>
@@ -3597,11 +3675,11 @@ class TernaryRRFe<string mnemonic, bits<16> opcode, RegisterOperand cls1,
              (ins imm32zx4:$M3, cls2:$R2, imm32zx4:$M4),
              mnemonic#"\t$R1, $M3, $R2, $M4", []>;
 
-class TernaryRRD<string mnemonic, bits<16> opcode,
-                 SDPatternOperator operator, RegisterOperand cls>
-  : InstRRD<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, cls:$R2),
+class TernaryRRD<string mnemonic, bits<16> opcode, SDPatternOperator operator,
+                 RegisterOperand cls1, RegisterOperand cls2>
+  : InstRRD<opcode, (outs cls1:$R1), (ins cls2:$R1src, cls2:$R3, cls2:$R2),
             mnemonic#"\t$R1, $R3, $R2",
-            [(set cls:$R1, (operator cls:$R1src, cls:$R3, cls:$R2))]> {
+            [(set cls1:$R1, (operator cls2:$R1src, cls2:$R3, cls2:$R2))]> {
   let OpKey = mnemonic#cls;
   let OpType = "reg";
   let Constraints = "$R1 = $R1src";
@@ -3661,12 +3739,13 @@ class SideEffectTernaryMemMemRSY<string mnemonic, bits<16> opcode,
 }
 
 class TernaryRXF<string mnemonic, bits<16> opcode, SDPatternOperator operator,
-                 RegisterOperand cls, SDPatternOperator load, bits<5> bytes>
-  : InstRXF<opcode, (outs cls:$R1),
-            (ins cls:$R1src, cls:$R3, bdxaddr12only:$XBD2),
+                 RegisterOperand cls1, RegisterOperand cls2,
+                 SDPatternOperator load, bits<5> bytes>
+  : InstRXF<opcode, (outs cls1:$R1),
+            (ins cls2:$R1src, cls2:$R3, bdxaddr12only:$XBD2),
             mnemonic#"\t$R1, $R3, $XBD2",
-            [(set cls:$R1, (operator cls:$R1src, cls:$R3,
-                                     (load bdxaddr12only:$XBD2)))]> {
+            [(set cls1:$R1, (operator cls2:$R1src, cls2:$R3,
+                                      (load bdxaddr12only:$XBD2)))]> {
   let OpKey = mnemonic#"r"#cls;
   let OpType = "mem";
   let Constraints = "$R1 = $R1src";
diff --git a/lib/Target/SystemZ/SystemZInstrHFP.td b/lib/Target/SystemZ/SystemZInstrHFP.td
new file mode 100644
index 000000000000..6d5b4b92f650
--- /dev/null
+++ b/lib/Target/SystemZ/SystemZInstrHFP.td
@@ -0,0 +1,240 @@
+//==- SystemZInstrHFP.td - Floating-point SystemZ instructions -*- tblgen-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The instructions in this file implement SystemZ hexadecimal floating-point
+// arithmetic.  Since this format is not mapped to any source-language data
+// type, these instructions are not used for code generation, but are provided
+// for use with the assembler and disassembler only.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and test.
+let Defs = [CC] in {
+  def LTER : UnaryRR <"lter", 0x32,   null_frag, FP32,  FP32>;
+  def LTDR : UnaryRR <"ltdr", 0x22,   null_frag, FP64,  FP64>;
+  def LTXR : UnaryRRE<"ltxr", 0xB362, null_frag, FP128, FP128>;
+}
+
+//===----------------------------------------------------------------------===//
+// Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Convert floating-point values to narrower representations.
+def LEDR : UnaryRR <"ledr", 0x35,   null_frag, FP32, FP64>;
+def LEXR : UnaryRRE<"lexr", 0xB366, null_frag, FP32, FP128>;
+def LDXR : UnaryRR <"ldxr", 0x25,   null_frag, FP64, FP128>;
+let isAsmParserOnly = 1 in {
+  def LRER : UnaryRR <"lrer", 0x35, null_frag, FP32, FP64>;
+  def LRDR : UnaryRR <"lrdr", 0x25, null_frag, FP64, FP128>;
+}
+
+// Extend floating-point values to wider representations.
+def LDER : UnaryRRE<"lder", 0xB324, null_frag, FP64,  FP32>;
+def LXER : UnaryRRE<"lxer", 0xB326, null_frag, FP128, FP32>;
+def LXDR : UnaryRRE<"lxdr", 0xB325, null_frag, FP128, FP64>;
+
+def LDE : UnaryRXE<"lde", 0xED24, null_frag, FP64,  4>;
+def LXE : UnaryRXE<"lxe", 0xED26, null_frag, FP128, 4>;
+def LXD : UnaryRXE<"lxd", 0xED25, null_frag, FP128, 8>;
+
+// Convert a signed integer register value to a floating-point one.
+def CEFR : UnaryRRE<"cefr", 0xB3B4, null_frag, FP32,  GR32>;
+def CDFR : UnaryRRE<"cdfr", 0xB3B5, null_frag, FP64,  GR32>;
+def CXFR : UnaryRRE<"cxfr", 0xB3B6, null_frag, FP128, GR32>;
+
+def CEGR : UnaryRRE<"cegr", 0xB3C4, null_frag, FP32,  GR64>;
+def CDGR : UnaryRRE<"cdgr", 0xB3C5, null_frag, FP64,  GR64>;
+def CXGR : UnaryRRE<"cxgr", 0xB3C6, null_frag, FP128, GR64>;
+
+// Convert a floating-point register value to a signed integer value,
+// with the second operand (modifier M3) specifying the rounding mode.
+let Defs = [CC] in {
+  def CFER : BinaryRRFe<"cfer", 0xB3B8, GR32, FP32>;
+  def CFDR : BinaryRRFe<"cfdr", 0xB3B9, GR32, FP64>;
+  def CFXR : BinaryRRFe<"cfxr", 0xB3BA, GR32, FP128>;
+
+  def CGER : BinaryRRFe<"cger", 0xB3C8, GR64, FP32>;
+  def CGDR : BinaryRRFe<"cgdr", 0xB3C9, GR64, FP64>;
+  def CGXR : BinaryRRFe<"cgxr", 0xB3CA, GR64, FP128>;
+}
+
+// Convert BFP to HFP.
+let Defs = [CC] in {
+  def THDER : UnaryRRE<"thder", 0xB358, null_frag, FP64, FP32>;
+  def THDR  : UnaryRRE<"thdr",  0xB359, null_frag, FP64, FP64>;
+}
+
+// Convert HFP to BFP.
+let Defs = [CC] in {
+  def TBEDR : BinaryRRFe<"tbedr", 0xB350, FP32, FP64>;
+  def TBDR  : BinaryRRFe<"tbdr",  0xB351, FP64, FP64>;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Negation (Load Complement).
+let Defs = [CC] in {
+  def LCER : UnaryRR <"lcer", 0x33,   null_frag, FP32,  FP32>;
+  def LCDR : UnaryRR <"lcdr", 0x23,   null_frag, FP64,  FP64>;
+  def LCXR : UnaryRRE<"lcxr", 0xB363, null_frag, FP128, FP128>;
+}
+
+// Absolute value (Load Positive).
+let Defs = [CC] in {
+  def LPER : UnaryRR <"lper", 0x30,   null_frag, FP32,  FP32>;
+  def LPDR : UnaryRR <"lpdr", 0x20,   null_frag, FP64,  FP64>;
+  def LPXR : UnaryRRE<"lpxr", 0xB360, null_frag, FP128, FP128>;
+}
+
+// Negative absolute value (Load Negative).
+let Defs = [CC] in {
+  def LNER : UnaryRR <"lner", 0x31,   null_frag, FP32,  FP32>;
+  def LNDR : UnaryRR <"lndr", 0x21,   null_frag, FP64,  FP64>;
+  def LNXR : UnaryRRE<"lnxr", 0xB361, null_frag, FP128, FP128>;
+}
+
+// Halve.
+def HER : UnaryRR <"her", 0x34, null_frag, FP32, FP32>;
+def HDR : UnaryRR <"hdr", 0x24, null_frag, FP64, FP64>;
+
+// Square root.
+def SQER : UnaryRRE<"sqer", 0xB245, null_frag, FP32,  FP32>;
+def SQDR : UnaryRRE<"sqdr", 0xB244, null_frag, FP64,  FP64>;
+def SQXR : UnaryRRE<"sqxr", 0xB336, null_frag, FP128, FP128>;
+
+def SQE : UnaryRXE<"sqe", 0xED34, null_frag, FP32, 4>;
+def SQD : UnaryRXE<"sqd", 0xED35, null_frag, FP64, 8>;
+
+// Round to an integer (rounding towards zero).
+def FIER : UnaryRRE<"fier", 0xB377, null_frag, FP32,  FP32>;
+def FIDR : UnaryRRE<"fidr", 0xB37F, null_frag, FP64,  FP64>;
+def FIXR : UnaryRRE<"fixr", 0xB367, null_frag, FP128, FP128>;
+
+
+//===----------------------------------------------------------------------===//
+// Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition.
+let Defs = [CC] in {
+  let isCommutable = 1 in {
+    def AER : BinaryRR<"aer", 0x3A, null_frag, FP32,  FP32>;
+    def ADR : BinaryRR<"adr", 0x2A, null_frag, FP64,  FP64>;
+    def AXR : BinaryRR<"axr", 0x36, null_frag, FP128, FP128>;
+  }
+  def AE : BinaryRX<"ae", 0x7A, null_frag, FP32, load, 4>;
+  def AD : BinaryRX<"ad", 0x6A, null_frag, FP64, load, 8>;
+}
+
+// Addition (unnormalized).
+let Defs = [CC] in {
+  let isCommutable = 1 in {
+    def AUR : BinaryRR<"aur", 0x3E, null_frag, FP32, FP32>;
+    def AWR : BinaryRR<"awr", 0x2E, null_frag, FP64, FP64>;
+  }
+  def AU : BinaryRX<"au", 0x7E, null_frag, FP32, load, 4>;
+  def AW : BinaryRX<"aw", 0x6E, null_frag, FP64, load, 8>;
+}
+
+// Subtraction.
+let Defs = [CC] in {
+  def SER : BinaryRR<"ser", 0x3B, null_frag, FP32,  FP32>;
+  def SDR : BinaryRR<"sdr", 0x2B, null_frag, FP64,  FP64>;
+  def SXR : BinaryRR<"sxr", 0x37, null_frag, FP128, FP128>;
+
+  def SE : BinaryRX<"se", 0x7B, null_frag, FP32, load, 4>;
+  def SD : BinaryRX<"sd", 0x6B, null_frag, FP64, load, 8>;
+}
+
+// Subtraction (unnormalized).
+let Defs = [CC] in {
+  def SUR : BinaryRR<"sur", 0x3F, null_frag, FP32, FP32>;
+  def SWR : BinaryRR<"swr", 0x2F, null_frag, FP64, FP64>;
+
+  def SU : BinaryRX<"su", 0x7F, null_frag, FP32, load, 4>;
+  def SW : BinaryRX<"sw", 0x6F, null_frag, FP64, load, 8>;
+}
+
+// Multiplication.
+let isCommutable = 1 in {
+  def MEER : BinaryRRE<"meer", 0xB337, null_frag, FP32,  FP32>;
+  def MDR  : BinaryRR <"mdr",  0x2C,   null_frag, FP64,  FP64>;
+  def MXR  : BinaryRR <"mxr",  0x26,   null_frag, FP128, FP128>;
+}
+def MEE : BinaryRXE<"mee", 0xED37, null_frag, FP32, load, 4>;
+def MD  : BinaryRX <"md",  0x6C,   null_frag, FP64, load, 8>;
+
+// Extending multiplication (f32 x f32 -> f64).
+def MDER : BinaryRR<"mder", 0x3C, null_frag, FP64, FP32>;
+def MDE  : BinaryRX<"mde",  0x7C, null_frag, FP64, load, 4>;
+let isAsmParserOnly = 1 in {
+  def MER : BinaryRR<"mer", 0x3C, null_frag, FP64, FP32>;
+  def ME  : BinaryRX<"me",  0x7C, null_frag, FP64, load, 4>;
+}
+
+// Extending multiplication (f64 x f64 -> f128).
+def MXDR : BinaryRR<"mxdr", 0x27, null_frag, FP128, FP64>;
+def MXD  : BinaryRX<"mxd",  0x67, null_frag, FP128, load, 8>;
+
+// Fused multiply-add.
+def MAER : TernaryRRD<"maer", 0xB32E, null_frag, FP32, FP32>;
+def MADR : TernaryRRD<"madr", 0xB33E, null_frag, FP64, FP64>;
+def MAE  : TernaryRXF<"mae",  0xED2E, null_frag, FP32, FP32, load, 4>;
+def MAD  : TernaryRXF<"mad",  0xED3E, null_frag, FP64, FP64, load, 8>;
+
+// Fused multiply-subtract.
+def MSER : TernaryRRD<"mser", 0xB32F, null_frag, FP32, FP32>;
+def MSDR : TernaryRRD<"msdr", 0xB33F, null_frag, FP64, FP64>;
+def MSE  : TernaryRXF<"mse",  0xED2F, null_frag, FP32, FP32, load, 4>;
+def MSD  : TernaryRXF<"msd",  0xED3F, null_frag, FP64, FP64, load, 8>;
+
+// Multiplication (unnormalized).
+def MYR  : BinaryRRD<"myr",  0xB33B, null_frag, FP128, FP64>;
+def MYHR : BinaryRRD<"myhr", 0xB33D, null_frag, FP64,  FP64>;
+def MYLR : BinaryRRD<"mylr", 0xB339, null_frag, FP64,  FP64>;
+def MY   : BinaryRXF<"my",   0xED3B, null_frag, FP128, FP64, load, 8>;
+def MYH  : BinaryRXF<"myh",  0xED3D, null_frag, FP64,  FP64, load, 8>;
+def MYL  : BinaryRXF<"myl",  0xED39, null_frag, FP64,  FP64, load, 8>;
+
+// Fused multiply-add (unnormalized).
+def MAYR  : TernaryRRD<"mayr",  0xB33A, null_frag, FP128, FP64>;
+def MAYHR : TernaryRRD<"mayhr", 0xB33C, null_frag, FP64,  FP64>;
+def MAYLR : TernaryRRD<"maylr", 0xB338, null_frag, FP64,  FP64>;
+def MAY   : TernaryRXF<"may",   0xED3A, null_frag, FP128, FP64, load, 8>;
+def MAYH  : TernaryRXF<"mayh",  0xED3C, null_frag, FP64,  FP64, load, 8>;
+def MAYL  : TernaryRXF<"mayl",  0xED38, null_frag, FP64,  FP64, load, 8>;
+
+// Division.
+def DER : BinaryRR <"der", 0x3D,   null_frag, FP32,  FP32>;
+def DDR : BinaryRR <"ddr", 0x2D,   null_frag, FP64,  FP64>;
+def DXR : BinaryRRE<"dxr", 0xB22D, null_frag, FP128, FP128>;
+def DE  : BinaryRX <"de",  0x7D,   null_frag, FP32, load, 4>;
+def DD  : BinaryRX <"dd",  0x6D,   null_frag, FP64, load, 8>;
+
+
+//===----------------------------------------------------------------------===//
+// Comparisons
+//===----------------------------------------------------------------------===//
+
+let Defs = [CC] in {
+  def CER : CompareRR <"cer", 0x39,   null_frag, FP32,  FP32>;
+  def CDR : CompareRR <"cdr", 0x29,   null_frag, FP64,  FP64>;
+  def CXR : CompareRRE<"cxr", 0xB369, null_frag, FP128, FP128>;
+
+  def CE : CompareRX<"ce", 0x79, null_frag, FP32, load, 4>;
+  def CD : CompareRX<"cd", 0x69, null_frag, FP64, load, 8>;
+}
+
diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td
index 612c3b6cf96e..5f5f2f690e58 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ13.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ13.td
@@ -908,6 +908,238 @@ def : InstRW<[FXa, Lat30, GroupAlone], (instregex "SFASR$")>;
 def : InstRW<[FXa, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
 def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>;
 
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[VecBF], (instregex "LEXR$")>;
+def : InstRW<[VecDF2, VecDF2], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXb], (instregex "LDER$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[VecBF], (instregex "THD(E)?R$")>;
+def : InstRW<[VecBF], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[VecBF], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>;
+def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[VecBF], (instregex "FIER$")>;
+def : InstRW<[VecBF], (instregex "FIDR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)?$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MY(H|L)?R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[VecBF], (instregex "M(A|S)DR$")>;
+def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>;
+def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAY(H|L)?R$")>;
+
+// Division
+def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>;
+def : InstRW<[VecFPd], (instregex "D(E|D)R$")>;
+def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)$")>;
+def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)R$")>;
+def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[VecDF], (instregex "LTDTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>;
+def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[VecDF], (instregex "LDETR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, BeginGroup], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>;
+def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>;
+def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>;
+
+// Convert from / to packed
+def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>;
+def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>;
+def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>;
+def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[VecDF], (instregex "FIDTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>;
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>;
+def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[VecDF], (instregex "ADTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[VecDF], (instregex "SDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[VecDF], (instregex "QADTR$")>;
+def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXb, VecDF, Lat11], (instregex "RRDTR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, VecDF, Lat11], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXb, VecDF, Lat11], (instregex "IEDTR$")>;
+def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[VecDF], (instregex "(K|C)DTR$")>;
+def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[VecDF], (instregex "CEDTR$")>;
+def : InstRW<[VecDF], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
+
 // --------------------------------- Vector --------------------------------- //
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td
index 670df8ff5541..126eac2e2072 100644
--- a/lib/Target/SystemZ/SystemZScheduleZ196.td
+++ b/lib/Target/SystemZ/SystemZScheduleZ196.td
@@ -839,5 +839,224 @@ def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
 def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
 def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
 
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[FPU], (instregex "LEXR$")>;
+def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXU], (instregex "LDER$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[FPU], (instregex "THD(E)?R$")>;
+def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIER$")>;
+def : InstRW<[FPU], (instregex "FIDR$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>;
+def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
+def : InstRW<[FPU], (instregex "C(E|D)R$")>;
+def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
+def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+
+// Perform floating-point operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
+def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
+def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
 }
 
diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td
index 1bdb8779dc72..d38ca64d2e9b 100644
--- a/lib/Target/SystemZ/SystemZScheduleZEC12.td
+++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td
@@ -877,5 +877,230 @@ def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>;
 def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>;
 def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>;
 
+
+// --------------------- Hexadecimal floating point ------------------------- //
+
+//===----------------------------------------------------------------------===//
+// HFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[FPU], (instregex "LT(D|E)R$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "LTXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[FPU], (instregex "(LEDR|LRER)$")>;
+def : InstRW<[FPU], (instregex "LEXR$")>;
+def : InstRW<[FPU], (instregex "(LDXR|LRDR)$")>;
+
+// Load lengthened
+def : InstRW<[LSU], (instregex "LDE$")>;
+def : InstRW<[FXU], (instregex "LDER$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "LX(D|E)$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "LX(D|E)R$")>;
+
+// Convert from fixed
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CE(F|G)R$")>;
+def : InstRW<[FXU, FPU, Lat9, GroupAlone], (instregex "CD(F|G)R$")>;
+def : InstRW<[FXU, FPU2, FPU2, Lat11, GroupAlone], (instregex "CX(F|G)R$")>;
+
+// Convert to fixed
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CF(E|D)R$")>;
+def : InstRW<[FXU, FPU, Lat12, GroupAlone], (instregex "CG(E|D)R$")>;
+def : InstRW<[FXU, FPU, FPU, Lat20, GroupAlone], (instregex "C(F|G)XR$")>;
+
+// Convert BFP to HFP / HFP to BFP.
+def : InstRW<[FPU], (instregex "THD(E)?R$")>;
+def : InstRW<[FPU], (instregex "TB(E)?DR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load Complement / Negative / Positive
+def : InstRW<[FPU], (instregex "L(C|N|P)DR$")>;
+def : InstRW<[FPU], (instregex "L(C|N|P)ER$")>;
+def : InstRW<[FPU2, FPU2, Lat9, GroupAlone], (instregex "L(C|N|P)XR$")>;
+
+// Halve
+def : InstRW<[FPU], (instregex "H(E|D)R$")>;
+
+// Square root
+def : InstRW<[FPU, LSU, Lat30], (instregex "SQ(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "SQ(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "SQXR$")>;
+
+// Load FP integer
+def : InstRW<[FPU], (instregex "FIER$")>;
+def : InstRW<[FPU], (instregex "FIDR$")>;
+def : InstRW<[FPU2, FPU2, Lat15, GroupAlone], (instregex "FIXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[FPU, LSU, Lat12], (instregex "A(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "A(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "AXR$")>;
+
+// Subtraction
+def : InstRW<[FPU, LSU, Lat12], (instregex "S(E|D|U|W)$")>;
+def : InstRW<[FPU], (instregex "S(E|D|U|W)R$")>;
+def : InstRW<[FPU2, FPU2, Lat20, GroupAlone], (instregex "SXR$")>;
+
+// Multiply
+def : InstRW<[FPU, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>;
+def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>;
+def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>;
+
+// Multiply and add / subtract
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>;
+def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>;
+def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>;
+def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>;
+def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>;
+
+// Division
+def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>;
+def : InstRW<[FPU, Lat30], (instregex "D(E|D)R$")>;
+def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXR$")>;
+
+//===----------------------------------------------------------------------===//
+// HFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)$")>;
+def : InstRW<[FPU], (instregex "C(E|D)R$")>;
+def : InstRW<[FPU, FPU, Lat15], (instregex "CXR$")>;
+
+
+// ------------------------ Decimal floating point -------------------------- //
+
+//===----------------------------------------------------------------------===//
+// DFP: Move instructions
+//===----------------------------------------------------------------------===//
+
+// Load and Test
+def : InstRW<[DFU, Lat20], (instregex "LTDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LTXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Conversion instructions
+//===----------------------------------------------------------------------===//
+
+// Load rounded
+def : InstRW<[DFU, Lat30], (instregex "LEDTR$")>;
+def : InstRW<[DFU, DFU, Lat30], (instregex "LDXTR$")>;
+
+// Load lengthened
+def : InstRW<[DFU, Lat20], (instregex "LDETR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>;
+
+// Convert from fixed / logical
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>;
+
+// Convert to fixed / logical
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>;
+def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>;
+def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>;
+
+// Convert from / to signed / unsigned packed
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "CX(S|U)TR$")>;
+def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>;
+def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>;
+
+// Convert from / to zoned
+def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>;
+def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>;
+def : InstRW<[FXU, LSU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>;
+def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>;
+
+// Perform floating-point operation
+def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Unary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Load FP integer
+def : InstRW<[DFU, Lat20], (instregex "FIDTR$")>;
+def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>;
+
+// Extract biased exponent
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>;
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>;
+
+// Extract significance
+def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>;
+def : InstRW<[FXU, DFU, DFU, Lat20, GroupAlone], (instregex "ESXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Binary arithmetic
+//===----------------------------------------------------------------------===//
+
+// Addition
+def : InstRW<[DFU, Lat30], (instregex "ADTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "AXTR(A)?$")>;
+
+// Subtraction
+def : InstRW<[DFU, Lat30], (instregex "SDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "SXTR(A)?$")>;
+
+// Multiply
+def : InstRW<[DFU, Lat30], (instregex "MDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>;
+
+// Division
+def : InstRW<[DFU, Lat30], (instregex "DDTR(A)?$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>;
+
+// Quantize
+def : InstRW<[DFU, Lat30], (instregex "QADTR$")>;
+def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>;
+
+// Reround
+def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>;
+
+// Shift significand left/right
+def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>;
+def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>;
+
+// Insert biased exponent
+def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>;
+def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>;
+
+//===----------------------------------------------------------------------===//
+// DFP: Comparisons
+//===----------------------------------------------------------------------===//
+
+// Compare
+def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>;
+def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>;
+
+// Compare biased exponent
+def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>;
+def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>;
+
+// Test Data Class/Group
+def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>;
+def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>;
+
 }
 
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 022679a7bc18..0ab0c2f25915 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -42,8 +42,10 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
       HasMiscellaneousExtensions(false),
       HasExecutionHint(false), HasLoadAndTrap(false),
       HasTransactionalExecution(false), HasProcessorAssist(false),
+      HasDFPZonedConversion(false),
       HasVector(false), HasLoadStoreOnCond2(false),
       HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false),
+      HasDFPPackedConversion(false),
       TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
       TLInfo(TM, *this), TSInfo(), FrameLowering() {}
 
diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h
index 770dd7cd939f..36e51921bf2f 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/lib/Target/SystemZ/SystemZSubtarget.h
@@ -47,10 +47,12 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
   bool HasLoadAndTrap;
   bool HasTransactionalExecution;
   bool HasProcessorAssist;
+  bool HasDFPZonedConversion;
   bool HasVector;
   bool HasLoadStoreOnCond2;
   bool HasLoadAndZeroRightmostByte;
   bool HasMessageSecurityAssist5;
+  bool HasDFPPackedConversion;
 
 private:
   Triple TargetTriple;
@@ -133,6 +135,9 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
   // Return true if the target has the processor-assist facility.
   bool hasProcessorAssist() const { return HasProcessorAssist; }
 
+  // Return true if the target has the DFP zoned-conversion facility.
+  bool hasDFPZonedConversion() const { return HasDFPZonedConversion; }
+
   // Return true if the target has the load-and-zero-rightmost-byte facility.
   bool hasLoadAndZeroRightmostByte() const {
     return HasLoadAndZeroRightmostByte;
@@ -142,6 +147,9 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
   // extension facility 5.
   bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; }
 
+  // Return true if the target has the DFP packed-conversion facility.
+  bool hasDFPPackedConversion() const { return HasDFPPackedConversion; }
+
   // Return true if the target has the vector facility.
   bool hasVector() const { return HasVector; }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 31a5ca1f4cc2..814377003cbc 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -84,8 +84,8 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
                     ISD::SETULT, ISD::SETULE, ISD::SETUGT, ISD::SETUGE})
       setCondCodeAction(CC, T, Expand);
     // Expand floating-point library function operators.
-    for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOWI, ISD::FPOW,
-                    ISD::FREM, ISD::FMA})
+    for (auto Op : {ISD::FSIN, ISD::FCOS, ISD::FSINCOS, ISD::FPOW, ISD::FREM,
+                    ISD::FMA})
       setOperationAction(Op, T, Expand);
     // Note supported floating-point library function operators that otherwise
     // default to expand.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 86744b064132..8d78308afe9d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -80,6 +80,12 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
              " of the loop header PC will be 0)."),
     cl::Hidden);
 
+static cl::opt<bool> MulConstantOptimization(
+    "mul-constant-optimization", cl::init(true),
+    cl::desc("Replace 'mul x, Const' with more effective instructions like "
+             "SHIFT, LEA, etc."),
+    cl::Hidden);
+
 /// Call this when the user attempts to do something unsupported, like
 /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
 /// report_fatal_error, so calling code should attempt to recover without
@@ -670,7 +676,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FSINCOS,   VT, Expand);
     setOperationAction(ISD::FCOS,      VT, Expand);
     setOperationAction(ISD::FREM,      VT, Expand);
-    setOperationAction(ISD::FPOWI,     VT, Expand);
     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     setOperationAction(ISD::FPOW,      VT, Expand);
     setOperationAction(ISD::FLOG,      VT, Expand);
@@ -30928,6 +30933,75 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
   }
 }
 
+static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
+                                 EVT VT, SDLoc DL) {
+
+  auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(Mult, DL, VT));
+    Result = DAG.getNode(ISD::SHL, DL, VT, Result,
+                         DAG.getConstant(Shift, DL, MVT::i8));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0),
+                         Result);
+    return Result;
+  };
+
+  auto combineMulMulAddOrSub = [&](bool isAdd) {
+    SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
+                                 DAG.getConstant(9, DL, VT));
+    Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
+    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, N->getOperand(0),
+                         Result);
+    return Result;
+  };
+
+  switch (MulAmt) {
+  default:
+    break;
+  case 11:
+    // mul x, 11 => add ((shl (mul x, 5), 1), x)
+    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
+  case 21:
+    // mul x, 21 => add ((shl (mul x, 5), 2), x)
+    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
+  case 22:
+    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
+  case 19:
+    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
+    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
+  case 13:
+    // mul x, 13 => add ((shl (mul x, 3), 2), x)
+    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
+  case 23:
+    // mul x, 13 => sub ((shl (mul x, 3), 3), x)
+    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
+  case 14:
+    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
+  case 26:
+    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
+    return combineMulMulAddOrSub(/*isAdd*/ false);
+  case 28:
+    // mul x, 28 => add ((mul (mul x, 9), 3), x)
+    return combineMulMulAddOrSub(/*isAdd*/ true);
+  case 29:
+    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
+    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
+                       combineMulMulAddOrSub(/*isAdd*/ true));
+  case 30:
+    // mul x, 30 => sub (sub ((shl x, 5), x), x)
+    return DAG.getNode(
+        ISD::SUB, DL, VT, N->getOperand(0),
+        DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0),
+                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
+                                DAG.getConstant(5, DL, MVT::i8))));
+  }
+  return SDValue();
+}
+
 /// Optimize a single multiply with constant into two operations in order to
 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
@@ -30937,6 +31011,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
   if (DCI.isBeforeLegalize() && VT.isVector())
     return reduceVMULWidth(N, DAG, Subtarget);
 
+  if (!MulConstantOptimization)
+    return SDValue();
   // An imul is usually smaller than the alternative sequence.
   if (DAG.getMachineFunction().getFunction()->optForMinSize())
     return SDValue();
@@ -30992,7 +31068,8 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
     else
       NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
                            DAG.getConstant(MulAmt2, DL, VT));
-  }
+  } else if (!Subtarget.slowLEA())
+    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
 
   if (!NewMul) {
     assert(MulAmt != 0 &&
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index 67abc3116988..5e9f40019ce8 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -377,7 +377,6 @@ class CongruenceClass {
   int StoreCount = 0;
 };
 
-struct HashedExpression;
 namespace llvm {
 template <> struct DenseMapInfo<const Expression *> {
   static const Expression *getEmptyKey() {
@@ -391,41 +390,25 @@ template <> struct DenseMapInfo<const Expression *> {
     return reinterpret_cast<const Expression *>(Val);
   }
   static unsigned getHashValue(const Expression *E) {
-    return static_cast<unsigned>(E->getHashValue());
+    return static_cast<unsigned>(E->getComputedHash());
   }
-  static unsigned getHashValue(const HashedExpression &HE);
-  static bool isEqual(const HashedExpression &LHS, const Expression *RHS);
   static bool isEqual(const Expression *LHS, const Expression *RHS) {
     if (LHS == RHS)
       return true;
     if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
         LHS == getEmptyKey() || RHS == getEmptyKey())
       return false;
+    // Compare hashes before equality.  This is *not* what the hashtable does,
+    // since it is computing it modulo the number of buckets, whereas we are
+    // using the full hash keyspace.  Since the hashes are precomputed, this
+    // check is *much* faster than equality.
+    if (LHS->getComputedHash() != RHS->getComputedHash())
+      return false;
     return *LHS == *RHS;
   }
 };
 } // end namespace llvm
 
-// This is just a wrapper around Expression that computes the hash value once at
-// creation time.  Hash values for an Expression can't change once they are
-// inserted into the DenseMap (it breaks DenseMap), so they must be immutable at
-// that point anyway.
-struct HashedExpression {
-  const Expression *E;
-  unsigned HashVal;
-  HashedExpression(const Expression *E)
-      : E(E), HashVal(DenseMapInfo<const Expression *>::getHashValue(E)) {}
-};
-
-unsigned
-DenseMapInfo<const Expression *>::getHashValue(const HashedExpression &HE) {
-  return HE.HashVal;
-}
-bool DenseMapInfo<const Expression *>::isEqual(const HashedExpression &LHS,
-                                               const Expression *RHS) {
-  return isEqual(LHS.E, RHS);
-}
-
 namespace {
 class NewGVN {
   Function &F;
@@ -707,7 +690,7 @@ class NewGVN {
   void markPredicateUsersTouched(Instruction *);
   void markValueLeaderChangeTouched(CongruenceClass *CC);
   void markMemoryLeaderChangeTouched(CongruenceClass *CC);
-  void markPhiOfOpsChanged(const HashedExpression &HE);
+  void markPhiOfOpsChanged(const Expression *E);
   void addPredicateUsers(const PredicateBase *, Instruction *) const;
   void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
   void addAdditionalUsers(Value *To, Value *User) const;
@@ -956,8 +939,12 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
   if (CC && CC->getDefiningExpr()) {
     // If we simplified to something else, we need to communicate
     // that we're users of the value we simplified to.
-    if (I != V)
-      addAdditionalUsers(V, I);
+    if (I != V) {
+      // Don't add temporary instructions to the user lists.
+      if (!AllTempInstructions.count(I))
+        addAdditionalUsers(V, I);
+    }
+
     if (I)
       DEBUG(dbgs() << "Simplified " << *I << " to "
                    << " expression " << *CC->getDefiningExpr() << "\n");
@@ -2195,8 +2182,8 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
 
 // For a given expression, mark the phi of ops instructions that could have
 // changed as a result.
-void NewGVN::markPhiOfOpsChanged(const HashedExpression &HE) {
-  touchAndErase(ExpressionToPhiOfOps, HE);
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+  touchAndErase(ExpressionToPhiOfOps, E);
 }
 
 // Perform congruence finding on a given value numbering expression.
@@ -2210,14 +2197,13 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
   assert(!IClass->isDead() && "Found a dead class");
 
   CongruenceClass *EClass = nullptr;
-  HashedExpression HE(E);
   if (const auto *VE = dyn_cast<VariableExpression>(E)) {
     EClass = ValueToClass.lookup(VE->getVariableValue());
   } else if (isa<DeadExpression>(E)) {
     EClass = TOPClass;
   }
   if (!EClass) {
-    auto lookupResult = ExpressionToClass.insert_as({E, nullptr}, HE);
+    auto lookupResult = ExpressionToClass.insert({E, nullptr});
 
     // If it's not in the value table, create a new congruence class.
     if (lookupResult.second) {
@@ -2268,7 +2254,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
                  << "\n");
     if (ClassChanged) {
       moveValueToNewCongruenceClass(I, E, IClass, EClass);
-      markPhiOfOpsChanged(HE);
+      markPhiOfOpsChanged(E);
     }
 
     markUsersTouched(I);
@@ -2502,9 +2488,8 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge,
         // Clone the instruction, create an expression from it, and see if we
         // have a leader.
         Instruction *ValueOp = I->clone();
-        auto Iter = TempToMemory.end();
         if (MemAccess)
-          Iter = TempToMemory.insert({ValueOp, MemAccess}).first;
+          TempToMemory.insert({ValueOp, MemAccess});
 
         for (auto &Op : ValueOp->operands()) {
           Op = Op->DoPHITranslation(PHIBlock, PredBB);
@@ -2523,7 +2508,7 @@ NewGVN::makePossiblePhiOfOps(Instruction *I, bool HasBackedge,
         AllTempInstructions.erase(ValueOp);
         ValueOp->deleteValue();
         if (MemAccess)
-          TempToMemory.erase(Iter);
+          TempToMemory.erase(ValueOp);
         if (!E)
           return nullptr;
         FoundVal = findPhiOfOpsLeader(E, PredBB);
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3b036a6ac430..2b83b8426d14 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7173,7 +7173,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // Note: Even if all instructions are scalarized, return true if any memory
   // accesses appear in the loop to get benefits from address folding etc.
   bool TypeNotScalarized =
-      VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
+      VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF;
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
@@ -7312,7 +7312,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
-  VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
+  VectorTy = ToVectorTy(RetTy, VF);
   auto SE = PSE.getSE();
 
   // TODO: We need to estimate the cost of intrinsic calls.
@@ -7445,10 +7445,9 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     } else if (Legal->isUniform(Op2)) {
       Op2VK = TargetTransformInfo::OK_UniformValue;
     }
-    SmallVector<const Value *, 4> Operands(I->operand_values());
-    unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
-    return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
-                                          Op2VK, Op1VP, Op2VP, Operands);
+    SmallVector<const Value *, 4> Operands(I->operand_values()); 
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK,
+                                      Op2VK, Op1VP, Op2VP, Operands);
   }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
@@ -7471,15 +7470,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   }
   case Instruction::Store:
   case Instruction::Load: {
-    unsigned Width = VF;
-    if (Width > 1) {
-      InstWidening Decision = getWideningDecision(I, Width);
-      assert(Decision != CM_Unknown &&
-             "CM decision should be taken at this point");
-      if (Decision == CM_Scalarize)
-        Width = 1;
-    }
-    VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+    VectorTy = ToVectorTy(getMemInstValueType(I), VF);
     return getMemoryInstructionCost(I, VF);
   }
   case Instruction::ZExt:
diff --git a/test/CodeGen/AArch64/reg-scavenge-frame.mir b/test/CodeGen/AArch64/reg-scavenge-frame.mir
new file mode 100644
index 000000000000..3300bb1e5831
--- /dev/null
+++ b/test/CodeGen/AArch64/reg-scavenge-frame.mir
@@ -0,0 +1,52 @@
+# RUN: llc -run-pass=prologepilog -verify-machineinstrs %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-linux-gnu"
+  define void @ScavengeForFrameWithoutOffset() { ret void }
+...
+---
+name:            ScavengeForFrameWithoutOffset
+tracksRegLiveness: true
+stack:
+  - { id: 0, type: spill-slot, offset: 0, size: 32, alignment: 8 }
+body:             |
+  bb.0:
+    liveins: %d16_d17_d18_d19
+    %x0 = COPY %xzr
+    %x1 = COPY %xzr
+    %x2 = COPY %xzr
+    %x3 = COPY %xzr
+    %x4 = COPY %xzr
+    %x5 = COPY %xzr
+    %x6 = COPY %xzr
+    %x7 = COPY %xzr
+    %x8 = COPY %xzr
+    %x9 = COPY %xzr
+    %x10 = COPY %xzr
+    %x11 = COPY %xzr
+    %x12 = COPY %xzr
+    %x13 = COPY %xzr
+    %x14 = COPY %xzr
+    %x15 = COPY %xzr
+    %x16 = COPY %xzr
+    %x17 = COPY %xzr
+    %x18 = COPY %xzr
+    %x19 = COPY %xzr
+    %x20 = COPY %xzr
+    %x21 = COPY %xzr
+    %x22 = COPY %xzr
+    %x23 = COPY %xzr
+    %x24 = COPY %xzr
+    %x25 = COPY %xzr
+    %x26 = COPY %xzr
+    %x27 = COPY %xzr
+    %x28 = COPY %xzr
+    %fp = COPY %xzr
+    %lr = COPY %xzr
+    ST1Fourv1d killed %d16_d17_d18_d19, %stack.0 :: (store 32 into %stack.0, align 8)
+# CHECK:  STRXui killed %[[SCAVREG:x[0-9]+|fp|lr]], %sp, [[SPOFFSET:[0-9]+]] :: (store 8 into %stack.1)
+# CHECK-NEXT:  %[[SCAVREG]] = ADDXri %sp, {{[0-9]+}}, 0
+# CHECK-NEXT:  ST1Fourv1d killed %d16_d17_d18_d19, killed %[[SCAVREG]] :: (store 32 into %stack.0, align 8)
+# CHECK-NEXT:  %[[SCAVREG]] = LDRXui %sp, [[SPOFFSET]] :: (load 8 from %stack.1)
+...
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
index a6b280578531..e5e2d436deb0 100644
--- a/test/CodeGen/AMDGPU/add.v2i16.ll
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -23,7 +23,7 @@ define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]]
 
 ; VI: s_add_i32
 ; VI: s_add_i32
@@ -50,7 +50,7 @@ define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <
 
 ; FIXME: VI should not scalarize arg access.
 ; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI: v_add_i32
 ; VI: v_add_i32_sdwa
@@ -62,10 +62,11 @@ define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
+; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0x1c8
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -79,10 +80,11 @@ define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %ou
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
 
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
+; VI-DAG: v_mov_b32_e32 v[[SCONST:[0-9]+]], 0xfffffc21
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -96,11 +98,11 @@ define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)*
 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
 ; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
 
+; VI: v_mov_b32_e32 v[[SCONST:[0-9]+]], -1
 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]]
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, v[[SCONST]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -114,7 +116,7 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 
 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}}
 
 ; VI-NOT: v_add_u16
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
@@ -134,12 +136,12 @@ define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 ; The high element gives fp
 ; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
-; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]{{$}}
 
 ; VI-NOT: v_add_u16
-; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
+; VI: v_mov_b32_e32 v[[K:[0-9]+]], 0x3f80
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v[[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NOT: v_add_u16
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -191,19 +193,17 @@ define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; GFX9: flat_load_dword [[A:v[0-9]+]]
 ; GFX9: flat_load_dword [[B:v[0-9]+]]
 
-; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
 ; GFX9: buffer_store_dwordx4
 
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; VI: flat_load_ushort v[[A_LO:[0-9]+]]
 ; VI: flat_load_ushort v[[A_HI:[0-9]+]]
 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
 
-; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; VI-DAG: v_add_u16_e32
 ; VI-DAG: v_add_u16_e32
 
diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll
index 791b49f0e143..6035e3bf4a5f 100644
--- a/test/CodeGen/AMDGPU/bfe-combine.ll
+++ b/test/CodeGen/AMDGPU/bfe-combine.ll
@@ -1,12 +1,16 @@
-; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI-SDWA %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s
 
 ; GCN-LABEL: {{^}}bfe_combine8:
 ; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8
 ; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]]
+; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
+; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
 ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]]
 ; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
+; VI-SDWA: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
 ; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]:
 define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2
@@ -22,6 +26,10 @@ define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x
 ; GCN-LABEL: {{^}}bfe_combine16:
 ; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16
 ; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]]
+; VI-SDWA: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 15
+; VI-SDWA: v_lshlrev_b32_sdwa v[[ADDRBASE1:[0-9]+]], v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-SDWA: v_lshlrev_b64 v{{\[}}[[ADDRBASE:[0-9]+]]:{{[^\]+}}], 2, v{{\[}}[[ADDRBASE1]]:{{[^\]+}}]
+; VI-SDWA: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]]
 ; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}}
 ; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]]
 ; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
index 973c4544d97a..66148a43a271 100644
--- a/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -amdgpu-sdwa-peephole=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll
index 8820e4fd80e5..f38c1f8aa6ed 100644
--- a/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -51,7 +51,7 @@ define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, fl
 ; FUNC-LABEL: @commute_add_lit_fabs_f32
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
-; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
+; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, [[K]]
 ; SI: buffer_store_dword [[REG]]
 define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 026dd7ca6c87..d772d1b67936 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index e16daa6fad9d..0328ce31002d 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -94,7 +94,6 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)
 ; GCN-DAG: v_cvt_f32_ubyte3_e32
 
 ; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
-; GCN-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
 
 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll
index 998e02f7bdf8..718176b80f0f 100644
--- a/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -55,7 +55,7 @@ define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x doub
 ; SI-LABEL: {{^}}fabs_fold_f64:
 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-NOT: and
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
 ; SI: s_endpgm
 define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @llvm.fabs.f64(double %in0)
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0,
 ; SI-LABEL: {{^}}fabs_fn_fold_f64:
 ; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
 ; SI-NOT: and
-; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
+; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
 ; SI: s_endpgm
 define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @fabs(double %in0)
diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll
index ac8fa3e45ef5..600c6cd8230e 100644
--- a/test/CodeGen/AMDGPU/fabs.ll
+++ b/test/CodeGen/AMDGPU/fabs.ll
@@ -75,7 +75,7 @@ define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float
 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
@@ -87,7 +87,7 @@ define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, fl
 ; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
-; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
 define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll
index f76ecf58d905..9b3d2a475a14 100644
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -96,9 +96,9 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
@@ -107,9 +107,9 @@ entry:
 ; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_mov_b32_e32 v[[CONST2:[0-9]+]], 0x4000
+; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -125,9 +125,9 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
@@ -136,10 +136,10 @@ entry:
 ; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
-; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[A_F16_1]]
+; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
+; VI-DAG: v_add_f16_sdwa v[[R_F16_0:[0-9]+]], v[[CONST1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_0]]
-; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_1]]
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
index 7eb7747de215..c936d98673ba 100644
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; CHECK-LABEL: {{^}}s_fadd_f64:
-; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
+; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
   %r2 = fadd double %r0, %r1
   store double %r2, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index c9787bb478ef..9e8ddd39bbaf 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -205,9 +205,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
-; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
+; VI: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
+; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI-NOT: v_and_b32
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
@@ -223,7 +223,8 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
 ; VI-DAG: v_bfe_u32
 ; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
-; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
+; VI: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
 ; VI-NOT: 0xffff
 ; VI: v_or_b32
@@ -240,9 +241,10 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspa
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
-; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
-; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+; VI-DAG: v_mul_f16_sdwa [[REG0:v[0-9]+]], v[[CONST1]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
 ; VI: v_or_b32
 
 ; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
@@ -259,11 +261,10 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
 
 ; FIXME: Fold modifier
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
-; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
-; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
-; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
+; VI-DAG: v_mov_b32_e32 v[[CONST1:[0-9]+]], 0x3c00
+; VI-DAG: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
+; VI-DAG: v_mul_f16_sdwa [[REG1:v[0-9]+]], v[[CONST1]], [[FNEG]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI-NOT: 0xffff
 
 ; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index 4e96091ae256..4ef2aa693cf4 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -96,17 +96,18 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG:  v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
+; VI-DAG:  v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
@@ -121,17 +122,18 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG:  v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
+; VI-DAG:  v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[CONST3]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 506b2a02f828..c256159726bf 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -71,7 +71,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspa
 ; FIXME: single bit op
 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
 ; CIVI: flat_store_dword
 
@@ -85,10 +87,15 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
 ; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
+; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_sdwa v{{[0-9]+}}, [[VMASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
 ; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 85f544032171..bc0e59980186 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -5,7 +5,7 @@
 ; into 2 modifiers, although theoretically that should work.
 
 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
-; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
 define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -25,7 +25,7 @@ define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, doubl
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
-; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
+; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|
 define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index a0cf37b159db..0a7346f410c9 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
 ; SI-NOT: and
-; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
+; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
 define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -15,7 +15,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
 ; SI-NOT: and
-; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}}
+; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
 ; SI-NOT: and
 define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index ed36666db807..16e4fc680bea 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -130,13 +130,15 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
 }
 
 ; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
-; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN-DAG: flat_load_dword [[VAL:v[0-9]+]]
 ; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
 ; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 
-; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
+; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
 ; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
-; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
+; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
+; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
+; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 7a5bcfffa3f3..9a56cbe983cd 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -12,7 +12,7 @@ declare double @llvm.floor.f64(double) #0
 ; SI-DAG: v_fract_f64_e32 [[FRC:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
 ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]]
 ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3
 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc
 ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc
@@ -39,7 +39,7 @@ define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace
 ; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]
 ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]]
 ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3
 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc
 ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc
@@ -67,7 +67,7 @@ define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrs
 ; SI-DAG: v_fract_f64_e64 [[FRC:v\[[0-9]+:[0-9]+\]]], -|v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]]|
 ; SI-DAG: v_mov_b32_e32 v[[UPLO:[0-9]+]], -1
 ; SI-DAG: v_mov_b32_e32 v[[UPHI:[0-9]+]], 0x3fefffff
-; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], v{{\[}}[[UPLO]]:[[UPHI]]], [[FRC]]
+; SI-DAG: v_min_f64 v{{\[}}[[MINLO:[0-9]+]]:[[MINHI:[0-9]+]]], [[FRC]], v{{\[}}[[UPLO]]:[[UPHI]]]
 ; SI-DAG: v_cmp_class_f64_e64 vcc, v{{\[}}[[LO]]:[[HI]]], 3
 ; SI: v_cndmask_b32_e32 v[[RESLO:[0-9]+]], v[[MINLO]], v[[LO]], vcc
 ; SI: v_cndmask_b32_e32 v[[RESHI:[0-9]+]], v[[MINHI]], v[[HI]], vcc
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index d3c5df317771..836b480b6a67 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -99,7 +99,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -111,14 +111,13 @@ entry:
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
-; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
+; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST2]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
-; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]] neg_lo:[1,0] neg_hi:[1,0]
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
@@ -134,7 +133,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 
 ; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -146,14 +145,13 @@ entry:
 ; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
-; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
+; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONSTM1]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
-; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
-; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}}
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]{{$}}
 
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index 1b0879d098ee..dc332414a152 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -39,7 +39,7 @@ define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double a
 }
 
 ; SI-LABEL: {{^}}s_fsub_f64:
-; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}
+; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double %a, %b
   store double %sub, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
index 96132d841997..bc951a82becd 100644
--- a/test/CodeGen/AMDGPU/immv216.ll
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -123,7 +123,8 @@ define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST0:v[0-9]+]], 0
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST0]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -140,7 +141,8 @@ define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -157,7 +159,8 @@ define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %ou
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM05:v[0-9]+]], 0xb800
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM05]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -174,7 +177,8 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)*
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 0x3c00
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -191,7 +195,8 @@ define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xbc00
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -208,7 +213,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)*
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -225,7 +231,8 @@ define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xc000
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -242,7 +249,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)*
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -259,7 +267,8 @@ define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %ou
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM4:v[0-9]+]], 0xc400
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM4]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -273,10 +282,10 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)*
 ; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
 ; GFX9: buffer_store_dword [[REG]]
 
+; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
 ; VI: buffer_load_dword
 ; VI-NOT: and
-; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST05]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
 ; VI: v_or_b32
 ; VI: buffer_store_dword
@@ -290,7 +299,7 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace
 ; GCN-LABEL: {{^}}commute_add_literal_v2f16:
 ; GFX9-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; GFX9-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
-; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]] op_sel_hi:[0,1]{{$}}
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], [[K]] op_sel_hi:[1,0]{{$}}
 ; GFX9: buffer_store_dword [[REG]]
 
 ; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
@@ -315,7 +324,8 @@ define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %o
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST1:v[0-9]+]], 1
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -332,7 +342,8 @@ define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out,
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 2
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -349,7 +360,8 @@ define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out,
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST16:v[0-9]+]], 16
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -366,7 +378,8 @@ define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM1:v[0-9]+]], 0xffff
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM1]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -383,7 +396,8 @@ define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM2:v[0-9]+]], 0xfffe
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM2]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -400,7 +414,8 @@ define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONSTM16:v[0-9]+]], 0xfff0
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONSTM16]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -417,7 +432,8 @@ define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)*
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST63:v[0-9]+]], 63
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST63]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
@@ -434,7 +450,8 @@ define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out
 ; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
 ; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
 ; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
-; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]]
+; VI-DAG: v_mov_b32_e32 [[CONST64:v[0-9]+]], 64
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[CONST64]], [[VAL1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32
 ; VI: buffer_store_dword
 define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 89adcff1a278..350dd38ef583 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -258,8 +258,10 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace
 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
+; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e70000
 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
@@ -278,9 +280,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1_inlineimm:
+; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xfff10000
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
+; CI:   v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
@@ -337,8 +342,10 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspac
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
+; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x45000000
 ; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 
 ; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
 ; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
@@ -357,9 +364,12 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1_inlineimm:
+; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0x230000
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
-; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
+; CI: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; GFX9: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; CI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
+; VI: v_or_b32_sdwa [[RES:v[0-9]+]], [[K]], [[VEC]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
 define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
@@ -411,11 +421,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
+; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 ; GCN: flat_load_dword [[IDX:v[0-9]+]]
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
+; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 
-; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
@@ -438,11 +449,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
+; GFX89: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; CI: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 ; GCN: flat_load_dword [[IDX:v[0-9]+]]
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
+; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 
-; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
 ; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
 ; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index e04d9e662cea..3bb5e21d67ac 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -27,7 +27,7 @@ entry:
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
 ; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
-; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
+; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @div_fixup_f16_imm_a(
@@ -46,7 +46,7 @@ entry:
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
-; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
+; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @div_fixup_f16_imm_b(
@@ -65,7 +65,7 @@ entry:
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 ; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
-; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
+; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @div_fixup_f16_imm_c(
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index a86468b07a27..2cc63ae74bf1 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -17,7 +17,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re
 ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
 ; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
-; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
+; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
index c9993ee88369..737be5d00447 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %
 }
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs:
-; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}}
+; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}, |{{v[0-9]+}}|
 define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
@@ -23,7 +23,7 @@ define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, floa
 }
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs:
-; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}|
+; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{s[0-9]+}}|, |{{v[0-9]+}}|
 define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index b47d2dbc744d..be8462d09064 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -27,7 +27,7 @@ define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out,
 }
 
 ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
-; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}|
+; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|, |v{{[0-9]+}}|
 define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 1b937ab93247..ef9cda142850 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -3,9 +3,8 @@
 
 ; GCN-LABEL: {{^}}test_barrier:
 ; GFX8: buffer_store_dword
-; GFX8: s_waitcnt
 ; GFX9: flat_store_dword
-; GFX9-NOT: s_waitcnt
+; GCN: s_waitcnt
 ; GCN: s_barrier
 define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out, i32 %size) #0 {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index 518fe8baaa7a..3f4fba7d8ead 100644
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -39,7 +39,7 @@ define amdgpu_kernel void @fma_f16(
 ; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
-; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
+; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_a(
@@ -62,7 +62,7 @@ define amdgpu_kernel void @fma_f16_imm_a(
 ; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
-; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
+; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_b(
@@ -85,7 +85,7 @@ define amdgpu_kernel void @fma_f16_imm_b(
 ; SI:  v_fma_f32 v[[R_F32:[0-9]+]], v[[A_F32:[0-9]]], v[[B_F32:[0-9]]], v[[C_F32:[0-9]]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
-; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
+; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
 define amdgpu_kernel void @fma_f16_imm_c(
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index f30fd1d58204..eec187390169 100644
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -50,7 +50,7 @@ define amdgpu_kernel void @fmuladd_f16(
 ; VI-FLUSH: buffer_store_short v[[C_F16]]
 
 ; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[KA]], v[[B_F16]], v[[C_F16]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[B_F16]], [[KA]], v[[C_F16]]
 ; VI-DENORM: buffer_store_short [[RESULT]]
 
 ; GCN: s_endpgm
@@ -78,7 +78,7 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
 ; VI-FLUSH: buffer_store_short v[[C_F16]]
 
 ; VI-DENORM: v_mov_b32_e32 [[KA:v[0-9]+]], 0x4200
-; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[KA]], v[[A_F16]], v[[C_F16]]
+; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], v[[A_F16]], [[KA]], v[[C_F16]]
 ; VI-DENORM buffer_store_short [[RESULT]]
 
 
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 4c8dff52509a..a4353d1136e1 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -101,18 +101,19 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
+; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
 ; GCN:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -128,18 +129,19 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
+; VI-DAG:  v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b8221356b664..4875d26fc860 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -100,7 +100,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -110,11 +110,11 @@ entry:
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
 
-; VI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; VI-DAG:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG:  v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
+; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST4]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
@@ -130,18 +130,19 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI-DAG:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG:  v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
+; VI-DAG:  v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], [[CONST3]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
 
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
 ; GCN-NOT: and
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 1e78c4ebcc9f..176d1d25f196 100644
--- a/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -10,7 +10,7 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 ; GCN-LABEL: {{^}}get_global_id_0:
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
 ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
-; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
+; GCN: v_mad_u32_u24 v{{[0-9]+}}, s8, [[VWGSIZEX]], v0
 define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
   %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 5f1fb0e2d732..8e0014911def 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -151,7 +151,7 @@ define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, flo
 ; GCN-LABEL: {{^}}no_madak_src0_modifier_f32:
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
-; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
+; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -173,7 +173,7 @@ define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalia
 ; GCN-LABEL: {{^}}no_madak_src1_modifier_f32:
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
-; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
+; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 6e70e95383c9..6bc40e82459b 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -129,7 +129,7 @@ define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]]
+; GCN: v_mad_f32 {{v[0-9]+}}, |[[VA]]|, [[VK]], [[VB]]
 define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -171,7 +171,7 @@ define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalia
 ; GCN-LABEL: {{^}}madmk_add_inline_imm_f32:
 ; GCN: buffer_load_dword [[A:v[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
-; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
+; GCN: v_mad_f32 {{v[0-9]+}}, [[A]], [[VK]], 2.0
 define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index a72a6efb0711..57c50c9804e5 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -211,10 +211,10 @@ endif:
 ; SI: s_mul_i32
 ; SI: v_mul_hi_u32
 ; SI: s_mul_i32
-; SI: s_mul_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_hi_u32
-; SI: s_mul_i32
+; SI-DAG: s_mul_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: s_mul_i32
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_u32
 ; SI: s_mul_i32
diff --git a/test/CodeGen/AMDGPU/scratch-simple.ll b/test/CodeGen/AMDGPU/scratch-simple.ll
index 60b9b56a48d1..6ed730ad60f4 100644
--- a/test/CodeGen/AMDGPU/scratch-simple.ll
+++ b/test/CodeGen/AMDGPU/scratch-simple.ll
@@ -9,13 +9,11 @@
 ; GCN-LABEL: {{^}}ps_main:
 
 ; GCN-DAG: s_mov_b32 [[SWO:s[0-9]+]], s0
-; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}}
 ; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
 ; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
 
-; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
-; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
+; GCN-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], 0x200, [[CLAMP_IDX]]
+; GCN-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], 0x400, [[CLAMP_IDX]]
 
 ; GCN: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, [[SWO]] offen
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index f9ac425be794..7ec6ca809b68 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -36,7 +36,7 @@ define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; FUNC-LABEL: {{^}}slow_sdiv_i32_3435:
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]],
 ; SI-DAG: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x98a1930b
-; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[MAGIC]], [[VAL]]
+; SI: v_mul_hi_i32 [[TMP:v[0-9]+]], [[VAL]], [[MAGIC]]
 ; SI: v_add_i32
 ; SI: v_lshrrev_b32
 ; SI: v_ashrrev_i32
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
index 73defc17d04f..a319edfc5ace 100644
--- a/test/CodeGen/AMDGPU/sdwa-peephole.ll
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -345,7 +345,10 @@ entry:
 
 ; GCN-LABEL: {{^}}immediate_mul_v2i16:
 ; NOSDWA-NOT: v_mul_u32_u24_sdwa
-; SDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
+; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
+; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M123]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v[[M321]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 
 define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
diff --git a/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
new file mode 100644
index 000000000000..cd50e01032c3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdwa-scalar-ops.mir
@@ -0,0 +1,410 @@
+# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: {{^}}sdwa_imm_operand:
+# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
+# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
+# GCN: BB0_1:
+# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+
+# GCN-LABEL: {{^}}sdwa_sgpr_operand:
+# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
+# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
+# GCN: BB1_1:
+# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+
+--- |
+  ; ModuleID = 'sdwa-scalar-ops.opt.ll'
+  source_filename = "sdwa-scalar-ops.opt.ll"
+  target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+  define amdgpu_kernel void @sdwa_imm_operand(i32 addrspace(1)* nocapture %arg) {
+  bb:
+    br label %bb2
+
+  bb1:                                              ; preds = %bb2
+    ret void
+
+  bb2:                                              ; preds = %bb2, %bb
+    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
+    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
+    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
+    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
+    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
+    %tmp6 = lshr i32 %tmp5, 8
+    %tmp7 = and i32 %tmp6, 255
+    %tmp8 = zext i32 %tmp7 to i64
+    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
+    store i32 1, i32 addrspace(1)* %tmp9, align 4
+    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
+    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
+    %tmp14 = lshr i32 %tmp13, 8
+    %tmp15 = and i32 %tmp14, 255
+    %tmp16 = zext i32 %tmp15 to i64
+    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
+    store i32 1, i32 addrspace(1)* %tmp17, align 4
+    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
+    %tmp1 = trunc i64 %lsr.iv.next to i32
+    %tmp19 = icmp eq i32 %tmp1, 4096
+    br i1 %tmp19, label %bb1, label %bb2
+  }
+
+  define amdgpu_kernel void @sdwa_sgpr_operand(i32 addrspace(1)* nocapture %arg) {
+  bb:
+    br label %bb2
+
+  bb1:                                              ; preds = %bb2
+    ret void
+
+  bb2:                                              ; preds = %bb2, %bb
+    %lsr.iv = phi i64 [ %lsr.iv.next, %bb2 ], [ 0, %bb ]
+    %bc = bitcast i32 addrspace(1)* %arg to i8 addrspace(1)*
+    %uglygep4 = getelementptr i8, i8 addrspace(1)* %bc, i64 %lsr.iv
+    %uglygep45 = bitcast i8 addrspace(1)* %uglygep4 to i32 addrspace(1)*
+    %tmp5 = load i32, i32 addrspace(1)* %uglygep45, align 4
+    %tmp6 = lshr i32 %tmp5, 8
+    %tmp7 = and i32 %tmp6, 255
+    %tmp8 = zext i32 %tmp7 to i64
+    %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp8
+    store i32 1, i32 addrspace(1)* %tmp9, align 4
+    %scevgep = getelementptr i32, i32 addrspace(1)* %uglygep45, i64 1
+    %tmp13 = load i32, i32 addrspace(1)* %scevgep, align 4
+    %tmp14 = lshr i32 %tmp13, 8
+    %tmp15 = and i32 %tmp14, 255
+    %tmp16 = zext i32 %tmp15 to i64
+    %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp16
+    store i32 1, i32 addrspace(1)* %tmp17, align 4
+    %lsr.iv.next = add nuw nsw i64 %lsr.iv, 8
+    %tmp1 = trunc i64 %lsr.iv.next to i32
+    %tmp19 = icmp eq i32 %tmp1, 4096
+    br i1 %tmp19, label %bb1, label %bb2
+  }
+
+...
+---
+name:            sdwa_imm_operand
+alignment:       0
+exposesReturnsTwice: false
+noVRegs:         false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sgpr_128 }
+  - { id: 4, class: sgpr_64 }
+  - { id: 5, class: sreg_32_xm0 }
+  - { id: 6, class: sgpr_32 }
+  - { id: 7, class: sreg_64 }
+  - { id: 8, class: sreg_64 }
+  - { id: 9, class: sreg_64_xexec }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sreg_32_xm0 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_64 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_32 }
+  - { id: 21, class: sreg_32_xm0 }
+  - { id: 22, class: sreg_32_xm0 }
+  - { id: 23, class: sreg_32_xm0 }
+  - { id: 24, class: sreg_64 }
+  - { id: 25, class: sreg_32_xm0 }
+  - { id: 26, class: sreg_32_xm0 }
+  - { id: 27, class: sreg_32_xm0 }
+  - { id: 28, class: sreg_32_xm0 }
+  - { id: 29, class: sreg_64 }
+  - { id: 30, class: vgpr_32 }
+  - { id: 31, class: vreg_64 }
+  - { id: 32, class: sreg_32_xm0 }
+  - { id: 33, class: sreg_32_xm0 }
+  - { id: 34, class: sreg_64 }
+  - { id: 35, class: sreg_32_xm0 }
+  - { id: 36, class: sreg_32_xm0 }
+  - { id: 37, class: sreg_32_xm0 }
+  - { id: 38, class: sreg_32_xm0 }
+  - { id: 39, class: vreg_64 }
+  - { id: 40, class: vgpr_32 }
+  - { id: 41, class: vreg_64 }
+  - { id: 42, class: sreg_32_xm0 }
+  - { id: 43, class: sreg_32 }
+  - { id: 44, class: sreg_32_xm0 }
+  - { id: 45, class: sreg_64 }
+  - { id: 46, class: sreg_32_xm0 }
+  - { id: 47, class: sreg_32_xm0 }
+  - { id: 48, class: sreg_32_xm0 }
+  - { id: 49, class: sreg_32_xm0 }
+  - { id: 50, class: sreg_64 }
+  - { id: 51, class: vreg_64 }
+  - { id: 52, class: sreg_64 }
+  - { id: 53, class: sreg_32_xm0 }
+  - { id: 54, class: sreg_32_xm0 }
+  - { id: 55, class: sreg_32_xm0 }
+  - { id: 56, class: sreg_32_xm0 }
+  - { id: 57, class: sreg_64 }
+  - { id: 58, class: sreg_32_xm0 }
+  - { id: 59, class: sreg_32_xm0 }
+  - { id: 60, class: vgpr_32 }
+  - { id: 61, class: vgpr_32 }
+  - { id: 62, class: vreg_64 }
+  - { id: 63, class: vgpr_32 }
+  - { id: 64, class: vgpr_32 }
+  - { id: 65, class: vgpr_32 }
+  - { id: 66, class: vgpr_32 }
+  - { id: 67, class: vreg_64 }
+  - { id: 68, class: vgpr_32 }
+  - { id: 69, class: vgpr_32 }
+  - { id: 70, class: vgpr_32 }
+  - { id: 71, class: vgpr_32 }
+  - { id: 72, class: vgpr_32 }
+  - { id: 73, class: vgpr_32 }
+  - { id: 74, class: vgpr_32 }
+  - { id: 75, class: vreg_64 }
+  - { id: 76, class: vgpr_32 }
+  - { id: 77, class: vgpr_32 }
+  - { id: 78, class: vgpr_32 }
+  - { id: 79, class: vgpr_32 }
+  - { id: 80, class: vreg_64 }
+  - { id: 81, class: vgpr_32 }
+  - { id: 82, class: vgpr_32 }
+  - { id: 83, class: vgpr_32 }
+liveins:
+  - { reg: '%sgpr4_sgpr5', virtual-reg: '%4' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.bb:
+    successors: %bb.2.bb2(0x80000000)
+    liveins: %sgpr4_sgpr5
+
+    %4 = COPY %sgpr4_sgpr5
+    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %8 = S_MOV_B64 0
+    %7 = COPY %9
+    %30 = V_MOV_B32_e32 1, implicit %exec
+    S_BRANCH %bb.2.bb2
+
+  bb.1.bb1:
+    S_ENDPGM
+
+  bb.2.bb2:
+    successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
+
+    %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
+    %13 = COPY %7.sub1
+    %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc
+    %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
+    %16 = REG_SEQUENCE %14, 1, %15, 2
+    %18 = COPY %16
+    %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
+    %60 = V_BFE_U32 %17, 8, 8, implicit %exec
+    %61 = V_LSHLREV_B32_e32 2, killed %60, implicit %exec
+    %70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
+    %66 = COPY %13
+    %65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
+    %67 = REG_SEQUENCE %70, 1, killed %65, 2
+    FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
+    %37 = S_ADD_U32 %14, 4, implicit-def %scc
+    %38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
+    %71 = COPY killed %37
+    %72 = COPY killed %38
+    %41 = REG_SEQUENCE killed %71, 1, killed %72, 2
+    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
+    %73 = V_BFE_U32 %40, 8, 8, implicit %exec
+    %74 = V_LSHLREV_B32_e32 2, killed %73, implicit %exec
+    %83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
+    %78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
+    %80 = REG_SEQUENCE %83, 1, killed %78, 2
+    FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
+    %55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
+    %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
+    %57 = REG_SEQUENCE %55, 1, killed %56, 2
+    %1 = COPY %57
+    S_CMPK_EQ_I32 %55, 4096, implicit-def %scc
+    S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc
+    S_BRANCH %bb.2.bb2
+
+...
+---
+name:            sdwa_sgpr_operand
+alignment:       0
+exposesReturnsTwice: false
+noVRegs:         false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64 }
+  - { id: 1, class: sreg_64 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: sgpr_128 }
+  - { id: 4, class: sgpr_64 }
+  - { id: 5, class: sreg_32_xm0 }
+  - { id: 6, class: sgpr_32 }
+  - { id: 7, class: sreg_64 }
+  - { id: 8, class: sreg_64 }
+  - { id: 9, class: sreg_64_xexec }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sreg_32_xm0 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_64 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: sreg_32_xm0 }
+  - { id: 20, class: sreg_32 }
+  - { id: 21, class: sreg_32_xm0 }
+  - { id: 22, class: sreg_32_xm0 }
+  - { id: 23, class: sreg_32_xm0 }
+  - { id: 24, class: sreg_64 }
+  - { id: 25, class: sreg_32_xm0 }
+  - { id: 26, class: sreg_32_xm0 }
+  - { id: 27, class: sreg_32_xm0 }
+  - { id: 28, class: sreg_32_xm0 }
+  - { id: 29, class: sreg_64 }
+  - { id: 30, class: vgpr_32 }
+  - { id: 31, class: vreg_64 }
+  - { id: 32, class: sreg_32_xm0 }
+  - { id: 33, class: sreg_32_xm0 }
+  - { id: 34, class: sreg_64 }
+  - { id: 35, class: sreg_32_xm0 }
+  - { id: 36, class: sreg_32_xm0 }
+  - { id: 37, class: sreg_32_xm0 }
+  - { id: 38, class: sreg_32_xm0 }
+  - { id: 39, class: vreg_64 }
+  - { id: 40, class: vgpr_32 }
+  - { id: 41, class: vreg_64 }
+  - { id: 42, class: sreg_32_xm0 }
+  - { id: 43, class: sreg_32 }
+  - { id: 44, class: sreg_32_xm0 }
+  - { id: 45, class: sreg_64 }
+  - { id: 46, class: sreg_32_xm0 }
+  - { id: 47, class: sreg_32_xm0 }
+  - { id: 48, class: sreg_32_xm0 }
+  - { id: 49, class: sreg_32_xm0 }
+  - { id: 50, class: sreg_64 }
+  - { id: 51, class: vreg_64 }
+  - { id: 52, class: sreg_64 }
+  - { id: 53, class: sreg_32_xm0 }
+  - { id: 54, class: sreg_32_xm0 }
+  - { id: 55, class: sreg_32_xm0 }
+  - { id: 56, class: sreg_32_xm0 }
+  - { id: 57, class: sreg_64 }
+  - { id: 58, class: sreg_32_xm0 }
+  - { id: 59, class: sreg_32_xm0 }
+  - { id: 60, class: vgpr_32 }
+  - { id: 61, class: vgpr_32 }
+  - { id: 62, class: vreg_64 }
+  - { id: 63, class: vgpr_32 }
+  - { id: 64, class: vgpr_32 }
+  - { id: 65, class: vgpr_32 }
+  - { id: 66, class: vgpr_32 }
+  - { id: 67, class: vreg_64 }
+  - { id: 68, class: vgpr_32 }
+  - { id: 69, class: vgpr_32 }
+  - { id: 70, class: vgpr_32 }
+  - { id: 71, class: vgpr_32 }
+  - { id: 72, class: vgpr_32 }
+  - { id: 73, class: vgpr_32 }
+  - { id: 74, class: vgpr_32 }
+  - { id: 75, class: vreg_64 }
+  - { id: 76, class: vgpr_32 }
+  - { id: 77, class: vgpr_32 }
+  - { id: 78, class: vgpr_32 }
+  - { id: 79, class: vgpr_32 }
+  - { id: 80, class: vreg_64 }
+  - { id: 81, class: vgpr_32 }
+  - { id: 82, class: vgpr_32 }
+  - { id: 83, class: vgpr_32 }
+  - { id: 84, class: sreg_32_xm0 }
+liveins:
+  - { reg: '%sgpr4_sgpr5', virtual-reg: '%4' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.bb:
+    successors: %bb.2.bb2(0x80000000)
+    liveins: %sgpr4_sgpr5
+
+    %4 = COPY %sgpr4_sgpr5
+    %9 = S_LOAD_DWORDX2_IMM %4, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %8 = S_MOV_B64 0
+    %7 = COPY %9
+    %30 = V_MOV_B32_e32 1, implicit %exec
+    %84 = S_MOV_B32 2
+    S_BRANCH %bb.2.bb2
+
+  bb.1.bb1:
+    S_ENDPGM
+
+  bb.2.bb2:
+    successors: %bb.1.bb1(0x04000000), %bb.2.bb2(0x7c000000)
+
+    %0 = PHI %8, %bb.0.bb, %1, %bb.2.bb2
+    %13 = COPY %7.sub1
+    %14 = S_ADD_U32 %7.sub0, %0.sub0, implicit-def %scc
+    %15 = S_ADDC_U32 %7.sub1, %0.sub1, implicit-def dead %scc, implicit %scc
+    %16 = REG_SEQUENCE %14, 1, %15, 2
+    %18 = COPY %16
+    %17 = FLAT_LOAD_DWORD %18, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.uglygep45)
+    %60 = V_BFE_U32 %17, 8, 8, implicit %exec
+    %61 = V_LSHLREV_B32_e32 %84, killed %60, implicit %exec
+    %70 = V_ADD_I32_e32 %7.sub0, %61, implicit-def %vcc, implicit %exec
+    %66 = COPY %13
+    %65 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
+    %67 = REG_SEQUENCE %70, 1, killed %65, 2
+    FLAT_STORE_DWORD %67, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp9)
+    %37 = S_ADD_U32 %14, 4, implicit-def %scc
+    %38 = S_ADDC_U32 %15, 0, implicit-def dead %scc, implicit %scc
+    %71 = COPY killed %37
+    %72 = COPY killed %38
+    %41 = REG_SEQUENCE killed %71, 1, killed %72, 2
+    %40 = FLAT_LOAD_DWORD killed %41, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.scevgep)
+    %73 = V_BFE_U32 %40, 8, 8, implicit %exec
+    %74 = V_LSHLREV_B32_e32 %84, killed %73, implicit %exec
+    %83 = V_ADD_I32_e32 %7.sub0, %74, implicit-def %vcc, implicit %exec
+    %78 = V_ADDC_U32_e32 0, %66, implicit-def %vcc, implicit %vcc, implicit %exec
+    %80 = REG_SEQUENCE %83, 1, killed %78, 2
+    FLAT_STORE_DWORD %80, %30, 0, 0, implicit %exec, implicit %flat_scr :: (store 4 into %ir.tmp17)
+    %55 = S_ADD_U32 %0.sub0, 8, implicit-def %scc
+    %56 = S_ADDC_U32 %0.sub1, 0, implicit-def dead %scc, implicit %scc
+    %57 = REG_SEQUENCE %55, 1, killed %56, 2
+    %1 = COPY %57
+    S_CMPK_EQ_I32 %55, 4096, implicit-def %scc
+    S_CBRANCH_SCC1 %bb.1.bb1, implicit %scc
+    S_BRANCH %bb.2.bb2
+
+...
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index 2a7a9c9e0638..92ee2eb7f403 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -196,11 +196,11 @@ entry:
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
-; SI:  v_cmp_lt_f32_e64
-; SI:  v_cmp_lt_f32_e32 vcc, 0.5
+; SI-DAG:  v_cmp_gt_f32_e64
+; SI-DAG:  v_cmp_lt_f32_e32 vcc, 0.5
 
 ; VI:  v_cmp_lt_f16_e32
-; VI:  v_cmp_lt_f16_e64
+; VI:  v_cmp_gt_f16_e64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e64
 ; SI:  v_cvt_f16_f32_e32
@@ -228,11 +228,11 @@ entry:
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
-; SI:  v_cmp_gt_f32_e64
-; SI:  v_cmp_gt_f32_e32 vcc, 0.5
+; SI-DAG:  v_cmp_lt_f32_e64
+; SI-DAG:  v_cmp_gt_f32_e32 vcc, 0.5
 
 ; VI:  v_cmp_gt_f16_e32
-; VI:  v_cmp_gt_f16_e64
+; VI:  v_cmp_lt_f16_e64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e64
 
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 0a29db4a0580..4f7b61adc91d 100644
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -5,7 +5,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 
 ; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 
@@ -24,14 +24,15 @@ define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128
 
 ; Extract the high bit of the 2nd quarter
 ; GCN-LABEL: {{^}}v_uextract_bit_63_i128:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO3:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO3]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -49,7 +50,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
-; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 
@@ -68,14 +69,15 @@ define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128
 
 ; Extract the high bit of the 4th quarter
 ; GCN-LABEL: {{^}}v_uextract_bit_127_i128:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
 
-; GCN: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO0:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO2:[0-9]+]], v[[ZERO0]]{{$}}
+; GCN: v_mov_b32_e32 v[[ZERO3:[0-9]+]], v[[ZERO0]]{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO3]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -90,15 +92,16 @@ define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128
 
 ; Spans more than 2 dword boundaries
 ; GCN-LABEL: {{^}}v_uextract_bit_34_100_i128:
-; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 
 ; GCN-DAG: v_lshl_b64 v{{\[}}[[SHLLO:[0-9]+]]:[[SHLHI:[0-9]+]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, 30
 ; GCN-DAG: v_lshrrev_b32_e32 v[[ELT1PART:[0-9]+]], 2, v{{[[0-9]+}}
 ; GCN-DAG: v_bfe_u32 v[[ELT2PART:[0-9]+]], v[[VAL3]], 2, 2{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_or_b32_e32 v[[OR0:[0-9]+]], v[[SHLLO]], v[[ELT1PART]]
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO1]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index 36c33b876919..a6026785b173 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -21,10 +21,11 @@ define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 add
 
 ; Extract the high bit of the high half
 ; GCN-LABEL: {{^}}v_uextract_bit_63_i64:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
@@ -69,10 +70,11 @@ define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 add
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_32_i64:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]{{$}}
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
@@ -85,10 +87,11 @@ define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 add
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_33_i64:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
@@ -167,10 +170,11 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_32_33_i64:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO1]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
@@ -183,11 +187,12 @@ define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_30_60_i64:
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 30
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
-; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO1]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
@@ -200,10 +205,11 @@ define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_33_63_i64:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
-; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}}
-; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO1:[0-9]+]], v[[ZERO]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO1]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
@@ -216,9 +222,10 @@ define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_31_63_i64:
+; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
-; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
+; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], v[[ZERO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -300,7 +307,8 @@ define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)*
 
 ; GCN-LABEL: {{^}}and_not_mask_i64:
 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[VALLO:[0-9]+]]:[[VALHI:[0-9]+]]{{\]}}
-; GCN: v_mov_b32_e32 v[[SHRHI:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[SHRHI:[0-9]+]], v[[ZERO]]{{$}}
 ; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 20, v[[VALLO]]
 ; GCN-DAG: v_and_b32_e32 v[[SHRLO:[0-9]+]], 4, [[SHR]]
 ; GCN-NOT: v[[SHRLO]]
@@ -321,7 +329,7 @@ define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspac
 ; keeping the 32-bit and has a smaller encoding size than the bfe.
 
 ; GCN-LABEL: {{^}}v_uextract_bit_27_29_multi_use_shift_i64:
-; GCN: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
+; GCN-DAG: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
 ; GCN-DAG: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 27
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
@@ -340,8 +348,8 @@ define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspac
 }
 
 ; GCN-LABEL: {{^}}v_uextract_bit_34_37_multi_use_shift_i64:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
-; GCN: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
+; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
+; GCN-DAG: v_mov_b32_e32 v[[ZERO_SHR:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[ZERO_BFE:[0-9]+]], v[[ZERO_SHR]]
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 2, [[VAL]]
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
@@ -362,6 +370,7 @@ define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspac
 ; GCN-LABEL: {{^}}v_uextract_bit_33_36_use_upper_half_shift_i64:
 ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 3
+; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:{{[0-9]+\]}}
 ; GCN: buffer_store_dword v[[ZERO]]
 define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
index 16ce86bf8b11..5d71ad2c8ba3 100644
--- a/test/CodeGen/AMDGPU/sminmax.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -40,13 +40,14 @@ define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %
 ; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
 ; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
 
+; VI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
-; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI: v_add_u16_sdwa v{{[0-9]+}}, [[TWO]], v{{[0-9]+}}  dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NOT: v_and_b32
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
@@ -206,7 +207,7 @@ define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <
 }
 
 ; GCN-LABEL: {{^}}u_min_max_v2i16:
-; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; GFX9: v_pk_max_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
   %cond0 = icmp ugt <2 x i16> %val0, %val1
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll
index c89f798397ae..e06725892089 100644
--- a/test/CodeGen/AMDGPU/srem.ll
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}srem_i32_7:
 ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x92492493
-; SI: v_mul_hi_i32 {{v[0-9]+}}, [[MAGIC]],
+; SI: v_mul_hi_i32 {{v[0-9]+}}, {{v[0-9]+}}, [[MAGIC]]
 ; SI: v_mul_lo_i32
 ; SI: v_sub_i32
 ; SI: s_endpgm
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
index 431344670ffb..6aeff3fc3b6c 100644
--- a/test/CodeGen/AMDGPU/sub.v2i16.ll
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -23,7 +23,7 @@ define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i
 ; GFX9: s_load_dword [[VAL0:s[0-9]+]]
 ; GFX9: s_load_dword [[VAL1:s[0-9]+]]
 ; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
-; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VAL0]], [[VVAL1]]
 
 ; VI: s_sub_i32
 ; VI: s_sub_i32
@@ -47,7 +47,7 @@ define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <
 
 ; FIXME: VI should not scalarize arg access.
 ; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
-; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI: v_subrev_i32_e32
 ; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
@@ -59,9 +59,10 @@ define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
-; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
 
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfffffe38
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -76,9 +77,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %ou
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
 ; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
-; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[CONST]]
 
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3df
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}}
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
 define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -93,11 +95,11 @@ define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)*
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
 ; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
 
+; VI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
 ; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
-; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]]
+; VI-DAG: v_add_u16_sdwa v{{[0-9]+}}, [[ONE]], [[LOAD0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -111,7 +113,7 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)*
 
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
-; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
 
 ; VI-NOT: v_subrev_i16
 ; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
@@ -131,12 +133,12 @@ define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspac
 ; The high element gives fp
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
 ; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
-; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
 
 ; VI-NOT: v_subrev_i16
-; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}}
+; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffffc080
+; VI: v_add_u16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-NOT: v_subrev_i16
-; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
 ; VI: v_or_b32_e32
 define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -185,10 +187,10 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GFX9: flat_load_dword [[A:v[0-9]+]]
 ; GFX9: flat_load_dword [[B:v[0-9]+]]
 
-; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
 ; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
@@ -199,8 +201,6 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)
 ; VI: flat_load_ushort v[[B_LO:[0-9]+]]
 ; VI: flat_load_ushort v[[B_HI:[0-9]+]]
 
-; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
-; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
 ; VI-DAG: v_subrev_u16_e32
 ; VI-DAG: v_subrev_u16_e32
 
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index 2874a0cdbc05..d9dab0d40acf 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -74,7 +74,7 @@ define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspa
 ; FUNC-LABEL: {{^}}udiv_i32_div_k_even:
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0xfabbd9c1
-; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
+; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -88,7 +88,7 @@ define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrs
 ; FUNC-LABEL: {{^}}udiv_i32_div_k_odd:
 ; SI-DAG: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7d5deca3
-; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
+; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[VAL]], [[K]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
@@ -176,7 +176,7 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
 
 ; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
+; SI: v_mul_hi_u32 v0, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
 define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
    %i = udiv i32 %p, 3
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
index fd7f8fa2efab..fb4eab43a2d6 100644
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -20,7 +20,7 @@ define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1
 
 ; FUNC-LABEL: {{^}}test_urem_i32_7:
 ; SI: v_mov_b32_e32 [[MAGIC:v[0-9]+]], 0x24924925
-; SI: v_mul_hi_u32 {{v[0-9]+}}, [[MAGIC]]
+; SI: v_mul_hi_u32 [[MAGIC]], {{v[0-9]+}}
 ; SI: v_subrev_i32
 ; SI: v_mul_lo_i32
 ; SI: v_sub_i32
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index f8e6b7edfe35..e6bdb68a4f77 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -54,8 +54,8 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(
 ; VI: buffer_load_dword [[VA0:v[0-9]+]]
 ; VI: buffer_load_dword [[VA1:v[0-9]+]]
 
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VA0]], [[SA]], [[VB]]
-; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SA]], [[VA0]], [[VB]]
+; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SA]], [[VA1]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
@@ -74,7 +74,7 @@ define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, fl
 ; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
@@ -88,7 +88,7 @@ define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(
 ; VI-DAG: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI-DAG: s_load_dword [[SGPR1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
+; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
 define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
@@ -228,7 +228,7 @@ define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addr
 ; GCN-DAG: v_mov_b32_e32 [[VK0:v[0-9]+]], 0x44800000
 ; GCN-DAG: v_mov_b32_e32 [[VS1:v[0-9]+]], [[SGPR1]]
 
-; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[VS1]], [[SGPR0]], [[VK0]]
+; GCN-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK0]]
 ; GCN-DAG: v_mov_b32_e32 [[VK1:v[0-9]+]], 0x45800000
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[SGPR0]], [[VS1]], [[VK1]]
 
@@ -251,7 +251,7 @@ define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a,
 
 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB0:[0-9]+]], s[[SGPR1_SUB0]]
 ; GCN-DAG: v_mov_b32_e32 v[[VS1_SUB1:[0-9]+]], s[[SGPR1_SUB1]]
-; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, [[SGPR0]], v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
+; GCN: v_fma_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[SGPR0]], v{{\[}}[[VS1_SUB0]]:[[VS1_SUB1]]{{\]}}, v{{\[}}[[VZERO]]:[[VK0_SUB1]]{{\]}}
 
 ; Same zero component is re-used for half of each immediate.
 ; GCN: v_mov_b32_e32 v[[VK1_SUB1:[0-9]+]], 0x40b00000
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll
index c45af522ec49..3da1a0324042 100644
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -482,8 +482,9 @@ entry:
 ; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
 ; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
 
-; VI:  v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
-; VI:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI-DAG:  v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
+; VI-DAG:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI-DAG:  v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-DAG:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
 
@@ -513,8 +514,9 @@ entry:
 ; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
 ; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
 
-; VI:  v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; VI:  v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI:  v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
 
@@ -544,8 +546,9 @@ entry:
 ; SI-DAG:  v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
 ; SI-DAG:  v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
 
-; VI:  v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
+; VI:  v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
+; VI:  v_sub_f16_sdwa v[[NEG_A0:[0-9]+]], [[ZERO]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-DAG:  v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG:  v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
 
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
index 9f277b2c9a59..133aaa35981e 100644
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -349,7 +349,7 @@ main_body:
 
 ; CHECK: [[LOOPHDR:BB[0-9]+_[0-9]+]]: ; %body
 ; CHECK: v_add_f32_e32 [[CTR]], 2.0, [[CTR]]
-; CHECK: v_cmp_lt_f32_e32 vcc, [[SEVEN]], [[CTR]]
+; CHECK: v_cmp_gt_f32_e32 vcc, [[CTR]], [[SEVEN]]
 ; CHECK: s_cbranch_vccz [[LOOPHDR]]
 ; CHECK: ; %break
 
diff --git a/test/CodeGen/WebAssembly/negative-base-reg.ll b/test/CodeGen/WebAssembly/negative-base-reg.ll
index 377966ffa8d9..fc3a287f5858 100644
--- a/test/CodeGen/WebAssembly/negative-base-reg.ll
+++ b/test/CodeGen/WebAssembly/negative-base-reg.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32"
+target triple = "wasm32---elf"
 
 @args = hidden local_unnamed_addr global [32 x i32] zeroinitializer, align 16
 
diff --git a/test/CodeGen/X86/bitcast-and-setcc-128.ll b/test/CodeGen/X86/bitcast-and-setcc-128.ll
new file mode 100644
index 000000000000..a681c3b0aa42
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-and-setcc-128.ll
@@ -0,0 +1,1155 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSE2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+ssse3 < %s | FileCheck %s --check-prefixes=SSE2-SSSE3,SSSE3
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx < %s | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefixes=AVX512
+
+define i8 @v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x i16> %d) {
+; SSE2-SSSE3-LABEL: v8i16:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    pextrw $7, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $6, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $5, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $4, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $3, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $1, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movd %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v8i16:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrw $7, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $6, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $5, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $4, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovd %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v8i16:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtw %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <8 x i16> %a, %b
+  %x1 = icmp sgt <8 x i16> %c, %d
+  %y = and <8 x i1> %x0, %x1
+  %res = bitcast <8 x i1> %y to i8
+  ret i8 %res
+}
+
+define i4 @v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+; SSE2-SSSE3-LABEL: v4i32:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    movd %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v4i32:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrd $3, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovd %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v4i32:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <4 x i32> %a, %b
+  %x1 = icmp sgt <4 x i32> %c, %d
+  %y = and <4 x i1> %x0, %x1
+  %res = bitcast <4 x i1> %y to i4
+  ret i4 %res
+}
+
+define i4 @v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
+; SSE2-SSSE3-LABEL: v4f32:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    cmpltps %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    cmpltps %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    andps %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movd %xmm3, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v4f32:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
+; AVX12-NEXT:    vcmpltps %xmm2, %xmm3, %xmm1
+; AVX12-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrd $3, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovd %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v4f32:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vcmpltps %xmm2, %xmm3, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = fcmp ogt <4 x float> %a, %b
+  %x1 = fcmp ogt <4 x float> %c, %d
+  %y = and <4 x i1> %x0, %x1
+  %res = bitcast <4 x i1> %y to i4
+  ret i4 %res
+}
+
+define i16 @v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; SSE2-SSSE3-LABEL: v16i8:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    pcmpgtb %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtb %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-SSSE3-NEXT:    andb $1, %cl
+; SSE2-SSSE3-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    andb $1, %al
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v16i8:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtb %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrb $15, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $14, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $13, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $12, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $11, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $10, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $9, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $7, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $6, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $5, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $4, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $3, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $2, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX12-NEXT:    andb $1, %al
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v16i8:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtb %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtb %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <16 x i8> %a, %b
+  %x1 = icmp sgt <16 x i8> %c, %d
+  %y = and <16 x i1> %x0, %x1
+  %res = bitcast <16 x i1> %y to i16
+  ret i16 %res
+}
+
+define i2 @v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d) {
+; SSE2-SSSE3-LABEL: v2i8:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    psllq $56, %xmm2
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    psllq $56, %xmm3
+; SSE2-SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    psllq $56, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    psllq $56, %xmm1
+; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm1
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: v2i8:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpsllq $56, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrad $24, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrad $24, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $56, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i8:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpsllq $56, %xmm3, %xmm3
+; AVX2-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX2-NEXT:    vpsrad $24, %xmm3, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm4
+; AVX2-NEXT:    vpsrad $24, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $56, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i8:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpsllq $56, %xmm3, %xmm3
+; AVX512-NEXT:    vpsraq $56, %xmm3, %xmm3
+; AVX512-NEXT:    vpsllq $56, %xmm2, %xmm2
+; AVX512-NEXT:    vpsraq $56, %xmm2, %xmm2
+; AVX512-NEXT:    vpsllq $56, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $56, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $56, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $56, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <2 x i8> %a, %b
+  %x1 = icmp sgt <2 x i8> %c, %d
+  %y = and <2 x i1> %x0, %x1
+  %res = bitcast <2 x i1> %y to i2
+  ret i2 %res
+}
+
+define i2 @v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d) {
+; SSE2-SSSE3-LABEL: v2i16:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    psllq $48, %xmm2
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    psllq $48, %xmm3
+; SSE2-SSSE3-NEXT:    movdqa %xmm3, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    psllq $48, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    psllq $48, %xmm1
+; SSE2-SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm1
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
+; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: v2i16:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpsllq $48, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX1-NEXT:    vpsrad $16, %xmm3, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $48, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm4
+; AVX1-NEXT:    vpsrad $16, %xmm2, %xmm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $48, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i16:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpsllq $48, %xmm3, %xmm3
+; AVX2-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX2-NEXT:    vpsrad $16, %xmm3, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $48, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm4
+; AVX2-NEXT:    vpsrad $16, %xmm2, %xmm2
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $48, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i16:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpsllq $48, %xmm3, %xmm3
+; AVX512-NEXT:    vpsraq $48, %xmm3, %xmm3
+; AVX512-NEXT:    vpsllq $48, %xmm2, %xmm2
+; AVX512-NEXT:    vpsraq $48, %xmm2, %xmm2
+; AVX512-NEXT:    vpsllq $48, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $48, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $48, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $48, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <2 x i16> %a, %b
+  %x1 = icmp sgt <2 x i16> %c, %d
+  %y = and <2 x i1> %x0, %x1
+  %res = bitcast <2 x i1> %y to i2
+  ret i2 %res
+}
+
+define i2 @v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+; SSE2-SSSE3-LABEL: v2i32:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    psllq $32, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
+; SSE2-SSSE3-NEXT:    psllq $32, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; SSE2-SSSE3-NEXT:    psllq $32, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1]
+; SSE2-SSSE3-NEXT:    psllq $32, %xmm1
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT:    psrad $31, %xmm1
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,3,2,3]
+; SSE2-SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm0, %xmm5
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm0, %xmm3
+; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm2
+; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm4
+; SSE2-SSSE3-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm2, %xmm4
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX1-LABEL: v2i32:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX1-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2,3],xmm3[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm4
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm4[2,3],xmm0[4,5],xmm4[6,7]
+; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    andl $1, %eax
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: v2i32:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX2-NEXT:    vpsrad $31, %xmm3, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm3 = xmm3[0],xmm4[1],xmm3[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX2-NEXT:    vpsrad $31, %xmm2, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3]
+; AVX2-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3]
+; AVX2-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v2i32:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpsllq $32, %xmm3, %xmm3
+; AVX512-NEXT:    vpsraq $32, %xmm3, %xmm3
+; AVX512-NEXT:    vpsllq $32, %xmm2, %xmm2
+; AVX512-NEXT:    vpsraq $32, %xmm2, %xmm2
+; AVX512-NEXT:    vpsllq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraq $32, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraq $32, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <2 x i32> %a, %b
+  %x1 = icmp sgt <2 x i32> %c, %d
+  %y = and <2 x i1> %x0, %x1
+  %res = bitcast <2 x i1> %y to i2
+  ret i2 %res
+}
+
+define i2 @v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> %c, <2 x i64> %d) {
+; SSE2-SSSE3-LABEL: v2i64:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [2147483648,0,2147483648,0]
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm0
+; SSE2-SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm5
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm6, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm5[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm3
+; SSE2-SSSE3-NEXT:    pxor %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm0
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[0,0,2,2]
+; SSE2-SSSE3-NEXT:    pcmpeqd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; SSE2-SSSE3-NEXT:    pand %xmm4, %xmm2
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE2-SSSE3-NEXT:    por %xmm2, %xmm0
+; SSE2-SSSE3-NEXT:    pand %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v2i64:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovq %xmm0, %rax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v2i64:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtq %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <2 x i64> %a, %b
+  %x1 = icmp sgt <2 x i64> %c, %d
+  %y = and <2 x i1> %x0, %x1
+  %res = bitcast <2 x i1> %y to i2
+  ret i2 %res
+}
+
+define i2 @v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c, <2 x double> %d) {
+; SSE2-SSSE3-LABEL: v2f64:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    cmpltpd %xmm0, %xmm1
+; SSE2-SSSE3-NEXT:    cmpltpd %xmm2, %xmm3
+; SSE2-SSSE3-NEXT:    andpd %xmm1, %xmm3
+; SSE2-SSSE3-NEXT:    movq %xmm3, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movq %xmm0, %rax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v2f64:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
+; AVX12-NEXT:    vcmpltpd %xmm2, %xmm3, %xmm1
+; AVX12-NEXT:    vandpd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovq %xmm0, %rax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v2f64:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vcmpltpd %xmm0, %xmm1, %k1
+; AVX512-NEXT:    vcmpltpd %xmm2, %xmm3, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = fcmp ogt <2 x double> %a, %b
+  %x1 = fcmp ogt <2 x double> %c, %d
+  %y = and <2 x i1> %x0, %x1
+  %res = bitcast <2 x i1> %y to i2
+  ret i2 %res
+}
+
+define i4 @v4i8(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
+; SSE2-SSSE3-LABEL: v4i8:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    pslld $24, %xmm3
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm3
+; SSE2-SSSE3-NEXT:    pslld $24, %xmm2
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm2
+; SSE2-SSSE3-NEXT:    pslld $24, %xmm1
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm1
+; SSE2-SSSE3-NEXT:    pslld $24, %xmm0
+; SSE2-SSSE3-NEXT:    psrad $24, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    movd %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v4i8:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpslld $24, %xmm3, %xmm3
+; AVX12-NEXT:    vpsrad $24, %xmm3, %xmm3
+; AVX12-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX12-NEXT:    vpsrad $24, %xmm2, %xmm2
+; AVX12-NEXT:    vpslld $24, %xmm1, %xmm1
+; AVX12-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX12-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX12-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrd $3, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovd %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v4i8:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpslld $24, %xmm3, %xmm3
+; AVX512-NEXT:    vpsrad $24, %xmm3, %xmm3
+; AVX512-NEXT:    vpslld $24, %xmm2, %xmm2
+; AVX512-NEXT:    vpsrad $24, %xmm2, %xmm2
+; AVX512-NEXT:    vpslld $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $24, %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $24, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <4 x i8> %a, %b
+  %x1 = icmp sgt <4 x i8> %c, %d
+  %y = and <4 x i1> %x0, %x1
+  %res = bitcast <4 x i1> %y to i4
+  ret i4 %res
+}
+
+define i4 @v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
+; SSE2-SSSE3-LABEL: v4i16:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    pslld $16, %xmm3
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm3
+; SSE2-SSSE3-NEXT:    pslld $16, %xmm2
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm2
+; SSE2-SSSE3-NEXT:    pslld $16, %xmm1
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm1
+; SSE2-SSSE3-NEXT:    pslld $16, %xmm0
+; SSE2-SSSE3-NEXT:    psrad $16, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtd %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    movd %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
+; SSE2-SSSE3-NEXT:    movd %xmm0, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v4i16:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpslld $16, %xmm3, %xmm3
+; AVX12-NEXT:    vpsrad $16, %xmm3, %xmm3
+; AVX12-NEXT:    vpslld $16, %xmm2, %xmm2
+; AVX12-NEXT:    vpsrad $16, %xmm2, %xmm2
+; AVX12-NEXT:    vpslld $16, %xmm1, %xmm1
+; AVX12-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX12-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX12-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtd %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrd $3, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovd %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v4i16:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpslld $16, %xmm3, %xmm3
+; AVX512-NEXT:    vpsrad $16, %xmm3, %xmm3
+; AVX512-NEXT:    vpslld $16, %xmm2, %xmm2
+; AVX512-NEXT:    vpsrad $16, %xmm2, %xmm2
+; AVX512-NEXT:    vpslld $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpsrad $16, %xmm1, %xmm1
+; AVX512-NEXT:    vpslld $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpsrad $16, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtd %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <4 x i16> %a, %b
+  %x1 = icmp sgt <4 x i16> %c, %d
+  %y = and <4 x i1> %x0, %x1
+  %res = bitcast <4 x i1> %y to i4
+  ret i4 %res
+}
+
+define i8 @v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
+; SSE2-SSSE3-LABEL: v8i8:
+; SSE2-SSSE3:       ## BB#0:
+; SSE2-SSSE3-NEXT:    psllw $8, %xmm3
+; SSE2-SSSE3-NEXT:    psraw $8, %xmm3
+; SSE2-SSSE3-NEXT:    psllw $8, %xmm2
+; SSE2-SSSE3-NEXT:    psraw $8, %xmm2
+; SSE2-SSSE3-NEXT:    psllw $8, %xmm1
+; SSE2-SSSE3-NEXT:    psraw $8, %xmm1
+; SSE2-SSSE3-NEXT:    psllw $8, %xmm0
+; SSE2-SSSE3-NEXT:    psraw $8, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm1, %xmm0
+; SSE2-SSSE3-NEXT:    pcmpgtw %xmm3, %xmm2
+; SSE2-SSSE3-NEXT:    pand %xmm0, %xmm2
+; SSE2-SSSE3-NEXT:    pextrw $7, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $6, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $5, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $4, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $3, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $2, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    pextrw $1, %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movd %xmm2, %eax
+; SSE2-SSSE3-NEXT:    andl $1, %eax
+; SSE2-SSSE3-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE2-SSSE3-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-SSSE3-NEXT:    retq
+;
+; AVX12-LABEL: v8i8:
+; AVX12:       ## BB#0:
+; AVX12-NEXT:    vpsllw $8, %xmm3, %xmm3
+; AVX12-NEXT:    vpsraw $8, %xmm3, %xmm3
+; AVX12-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX12-NEXT:    vpsraw $8, %xmm2, %xmm2
+; AVX12-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX12-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX12-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX12-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpcmpgtw %xmm3, %xmm2, %xmm1
+; AVX12-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX12-NEXT:    vpextrw $7, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $6, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $5, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $4, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    vmovd %xmm0, %eax
+; AVX12-NEXT:    andl $1, %eax
+; AVX12-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX12-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: v8i8:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpsllw $8, %xmm3, %xmm3
+; AVX512-NEXT:    vpsraw $8, %xmm3, %xmm3
+; AVX512-NEXT:    vpsllw $8, %xmm2, %xmm2
+; AVX512-NEXT:    vpsraw $8, %xmm2, %xmm2
+; AVX512-NEXT:    vpsllw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsraw $8, %xmm1, %xmm1
+; AVX512-NEXT:    vpsllw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpsraw $8, %xmm0, %xmm0
+; AVX512-NEXT:    vpcmpgtw %xmm1, %xmm0, %k1
+; AVX512-NEXT:    vpcmpgtw %xmm3, %xmm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <8 x i8> %a, %b
+  %x1 = icmp sgt <8 x i8> %c, %d
+  %y = and <8 x i1> %x0, %x1
+  %res = bitcast <8 x i1> %y to i8
+  ret i8 %res
+}
diff --git a/test/CodeGen/X86/bitcast-and-setcc-256.ll b/test/CodeGen/X86/bitcast-and-setcc-256.ll
new file mode 100644
index 000000000000..06b1a76f6bae
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -0,0 +1,403 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX2
+; RUN: llc -mtriple=x86_64-apple-darwin -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw < %s | FileCheck %s --check-prefix=AVX512
+
+define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
+; AVX2-LABEL: v4i64:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4i64:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
+; AVX512-NEXT:    vpcmpgtq %ymm3, %ymm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <4 x i64> %a, %b
+  %x1 = icmp sgt <4 x i64> %c, %d
+  %y = and <4 x i1> %x0, %x1
+  %res = bitcast <4 x i1> %y to i4
+  ret i4 %res
+}
+
+define i4 @v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
+; AVX2-LABEL: v4f64:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vcmpltpd %ymm2, %ymm3, %ymm1
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrd $3, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrd $2, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrd $1, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v4f64:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vcmpltpd %ymm2, %ymm3, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX512-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = fcmp ogt <4 x double> %a, %b
+  %x1 = fcmp ogt <4 x double> %c, %d
+  %y = and <4 x i1> %x0, %x1
+  %res = bitcast <4 x i1> %y to i4
+  ret i4 %res
+}
+
+define i16 @v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16> %c, <16 x i16> %d) {
+; AVX2-LABEL: v16i16:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtw %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $14, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $13, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $12, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $11, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $10, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $9, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $7, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $6, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $4, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $3, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $2, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movzwl -{{[0-9]+}}(%rsp), %eax
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v16i16:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtw %ymm1, %ymm0, %k1
+; AVX512-NEXT:    vpcmpgtw %ymm3, %ymm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <16 x i16> %a, %b
+  %x1 = icmp sgt <16 x i16> %c, %d
+  %y = and <16 x i1> %x0, %x1
+  %res = bitcast <16 x i1> %y to i16
+  ret i16 %res
+}
+
+define i8 @v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) {
+; AVX2-LABEL: v8i32:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpcmpgtd %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $7, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $6, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $5, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $4, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v8i32:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
+; AVX512-NEXT:    vpcmpgtd %ymm3, %ymm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <8 x i32> %a, %b
+  %x1 = icmp sgt <8 x i32> %c, %d
+  %y = and <8 x i1> %x0, %x1
+  %res = bitcast <8 x i1> %y to i8
+  ret i8 %res
+}
+
+define i8 @v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c, <8 x float> %d) {
+; AVX2-LABEL: v8f32:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vcmpltps %ymm2, %ymm3, %ymm1
+; AVX2-NEXT:    vextractf128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpacksswb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpextrw $7, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $6, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $5, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $4, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $3, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $2, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vpextrw $1, %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    andl $1, %eax
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v8f32:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vcmpltps %ymm0, %ymm1, %k1
+; AVX512-NEXT:    vcmpltps %ymm2, %ymm3, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = fcmp ogt <8 x float> %a, %b
+  %x1 = fcmp ogt <8 x float> %c, %d
+  %y = and <8 x i1> %x0, %x1
+  %res = bitcast <8 x i1> %y to i8
+  ret i8 %res
+}
+
+define i32 @v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
+; AVX2-LABEL: v32i8:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:  Lcfi0:
+; AVX2-NEXT:    .cfi_def_cfa_offset 16
+; AVX2-NEXT:  Lcfi1:
+; AVX2-NEXT:    .cfi_offset %rbp, -16
+; AVX2-NEXT:    movq %rsp, %rbp
+; AVX2-NEXT:  Lcfi2:
+; AVX2-NEXT:    .cfi_def_cfa_register %rbp
+; AVX2-NEXT:    andq $-32, %rsp
+; AVX2-NEXT:    subq $32, %rsp
+; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpcmpgtb %ymm3, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrb $15, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $14, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $13, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $12, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $11, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $10, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $9, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $8, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $7, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $6, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $5, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $4, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $3, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $2, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $1, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $0, %xmm1, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $15, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $14, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $13, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $12, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $11, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $10, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $9, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $7, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $6, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $5, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $4, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $3, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $2, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $1, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    andb $1, %al
+; AVX2-NEXT:    movb %al, (%rsp)
+; AVX2-NEXT:    movl (%rsp), %eax
+; AVX2-NEXT:    movq %rbp, %rsp
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: v32i8:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vpcmpgtb %ymm1, %ymm0, %k1
+; AVX512-NEXT:    vpcmpgtb %ymm3, %ymm2, %k0 {%k1}
+; AVX512-NEXT:    kmovd %k0, %eax
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+  %x0 = icmp sgt <32 x i8> %a, %b
+  %x1 = icmp sgt <32 x i8> %c, %d
+  %y = and <32 x i1> %x0, %x1
+  %res = bitcast <32 x i1> %y to i32
+  ret i32 %res
+}
diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll
index e3e2737cf3e6..6d2465ddd3a8 100644
--- a/test/CodeGen/X86/mul-constant-i16.ll
+++ b/test/CodeGen/X86/mul-constant-i16.ll
@@ -188,13 +188,16 @@ define i16 @test_mul_by_11(i16 %x) {
 ; X86-LABEL: test_mul_by_11:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $11, %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_11:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $11, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,2), %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 11
@@ -225,13 +228,16 @@ define i16 @test_mul_by_13(i16 %x) {
 ; X86-LABEL: test_mul_by_13:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $13, %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_13:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $13, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 13
@@ -241,14 +247,19 @@ define i16 @test_mul_by_13(i16 %x) {
 define i16 @test_mul_by_14(i16 %x) {
 ; X86-LABEL: test_mul_by_14:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $14, %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %eax
+; X86-NEXT:    leal (%ecx,%eax,4), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_14:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $14, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 14
@@ -338,14 +349,19 @@ define i16 @test_mul_by_19(i16 %x) {
 ; X86-LABEL: test_mul_by_19:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $19, %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    shll $2, %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_19:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $19, %edi, %eax
-; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    shll $2, %eax
+; X64-NEXT:    subl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 19
   ret i16 %mul
@@ -375,13 +391,16 @@ define i16 @test_mul_by_21(i16 %x) {
 ; X86-LABEL: test_mul_by_21:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $21, %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_21:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $21, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 21
@@ -391,14 +410,19 @@ define i16 @test_mul_by_21(i16 %x) {
 define i16 @test_mul_by_22(i16 %x) {
 ; X86-LABEL: test_mul_by_22:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $22, %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,4), %eax
+; X86-NEXT:    leal (%ecx,%eax,4), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_22:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $22, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 22
@@ -409,14 +433,19 @@ define i16 @test_mul_by_23(i16 %x) {
 ; X86-LABEL: test_mul_by_23:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $23, %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    shll $3, %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_23:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $23, %edi, %eax
-; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    shll $3, %eax
+; X64-NEXT:    subl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 23
   ret i16 %mul
@@ -466,14 +495,19 @@ define i16 @test_mul_by_26(i16 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $26, %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_26:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $26, %edi, %eax
-; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    subl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 26
   ret i16 %mul
@@ -502,14 +536,19 @@ define i16 @test_mul_by_27(i16 %x) {
 define i16 @test_mul_by_28(i16 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $28, %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_28:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $28, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 28
@@ -519,14 +558,21 @@ define i16 @test_mul_by_28(i16 %x) {
 define i16 @test_mul_by_29(i16 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $29, %eax, %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_29:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $29, %edi, %eax
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    addl %edi, %eax
 ; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 29
@@ -537,14 +583,22 @@ define i16 @test_mul_by_30(i16 %x) {
 ; X86-LABEL: test_mul_by_30:
 ; X86:       # BB#0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    imull $30, %eax, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $5, %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; X86-NEXT:    retl
 ;
 ; X64-LABEL: test_mul_by_30:
 ; X64:       # BB#0:
-; X64-NEXT:    imull $30, %edi, %eax
-; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    subl %eax, %ecx
+; X64-NEXT:    subl %ecx, %edi
+; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    retq
   %mul = mul nsw i16 %x, 30
   ret i16 %mul
@@ -587,3 +641,30 @@ define i16 @test_mul_by_32(i16 %x) {
   %mul = mul nsw i16 %x, 32
   ret i16 %mul
 }
+
+; (x*9+42)*(x*5+2)
+define i16 @test_mul_spec(i16 %x) nounwind {
+; X86-LABEL: test_mul_spec:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal 42(%eax,%eax,8), %ecx
+; X86-NEXT:    leal 2(%eax,%eax,4), %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_spec:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal 42(%rdi,%rdi,8), %ecx
+; X64-NEXT:    leal 2(%rdi,%rdi,4), %eax
+; X64-NEXT:    imull %ecx, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 9
+  %add = add nsw i16 %mul, 42
+  %mul2 = mul nsw i16 %x, 5
+  %add2 = add nsw i16 %mul2, 2
+  %mul3 = mul nsw i16 %add, %add2
+  ret i16 %mul3
+}
diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll
index 76e46e1f1b09..b1e9a929b7f2 100644
--- a/test/CodeGen/X86/mul-constant-i32.ll
+++ b/test/CodeGen/X86/mul-constant-i32.ll
@@ -1,6 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
+; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
 
 define i32 @test_mul_by_1(i32 %x) {
 ; X86-LABEL: test_mul_by_1:
@@ -8,10 +14,40 @@ define i32 @test_mul_by_1(i32 %x) {
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_1:
-; X64:       # BB#0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_1:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_1:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_1:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_1:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_1:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_1:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_1:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 1
   ret i32 %mul
 }
@@ -23,11 +59,47 @@ define i32 @test_mul_by_2(i32 %x) {
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_2:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_2:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_2:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_2:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    addl %eax, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_2:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_2:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_2:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_2:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal (%rdi,%rdi), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 2
   ret i32 %mul
 }
@@ -38,11 +110,46 @@ define i32 @test_mul_by_3(i32 %x) {
 ; X86-NEXT:    imull $3, {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_3:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi,2), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_3:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_3:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_3:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $3, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_3:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_3:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_3:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_3:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 3
   ret i32 %mul
 }
@@ -54,11 +161,47 @@ define i32 @test_mul_by_4(i32 %x) {
 ; X86-NEXT:    shll $2, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_4:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (,%rdi,4), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_4:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_4:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_4:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    shll $2, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_4:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_4:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal (,%rdi,4), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_4:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_4:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal (,%rdi,4), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 4
   ret i32 %mul
 }
@@ -69,11 +212,46 @@ define i32 @test_mul_by_5(i32 %x) {
 ; X86-NEXT:    imull $5, {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_5:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_5:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_5:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_5:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $5, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_5:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_5:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_5:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_5:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 5
   ret i32 %mul
 }
@@ -86,12 +264,46 @@ define i32 @test_mul_by_6(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_6:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    addl %edi, %edi
-; X64-NEXT:    leal (%rdi,%rdi,2), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_6:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_6:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    addl %edi, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_6:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $6, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_6:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_6:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_6:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    addl %edi, %edi # sched: [1:0.50]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_6:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $6, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 6
   ret i32 %mul
 }
@@ -104,12 +316,46 @@ define i32 @test_mul_by_7(i32 %x) {
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_7:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (,%rdi,8), %eax
-; X64-NEXT:    subl %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_7:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_7:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_7:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $7, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_7:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_7:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_7:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (,%rdi,8), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    subl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_7:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $7, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 7
   ret i32 %mul
 }
@@ -121,11 +367,47 @@ define i32 @test_mul_by_8(i32 %x) {
 ; X86-NEXT:    shll $3, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_8:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (,%rdi,8), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_8:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_8:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_8:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    shll $3, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_8:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_8:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal (,%rdi,8), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_8:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (,%rdi,8), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_8:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal (,%rdi,8), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 8
   ret i32 %mul
 }
@@ -136,11 +418,46 @@ define i32 @test_mul_by_9(i32 %x) {
 ; X86-NEXT:    imull $9, {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_9:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_9:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_9:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_9:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $9, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_9:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_9:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_9:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_9:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 9
   ret i32 %mul
 }
@@ -153,12 +470,46 @@ define i32 @test_mul_by_10(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_10:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    addl %edi, %edi
-; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_10:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_10:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    addl %edi, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_10:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $10, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_10:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_10:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_10:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    addl %edi, %edi # sched: [1:0.50]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_10:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $10, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 10
   ret i32 %mul
 }
@@ -166,13 +517,49 @@ define i32 @test_mul_by_10(i32 %x) {
 define i32 @test_mul_by_11(i32 %x) {
 ; X86-LABEL: test_mul_by_11:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $11, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_11:
-; X64:       # BB#0:
-; X64-NEXT:    imull $11, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_11:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rax,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_11:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_11:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $11, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_11:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_11:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_11:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $11, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_11:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $11, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 11
   ret i32 %mul
 }
@@ -185,12 +572,46 @@ define i32 @test_mul_by_12(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_12:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    shll $2, %edi
-; X64-NEXT:    leal (%rdi,%rdi,2), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_12:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_12:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    shll $2, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_12:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $12, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_12:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_12:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_12:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    shll $2, %edi # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_12:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $12, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 12
   ret i32 %mul
 }
@@ -198,13 +619,49 @@ define i32 @test_mul_by_12(i32 %x) {
 define i32 @test_mul_by_13(i32 %x) {
 ; X86-LABEL: test_mul_by_13:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $13, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_13:
-; X64:       # BB#0:
-; X64-NEXT:    imull $13, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_13:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_13:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_13:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $13, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_13:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_13:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_13:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $13, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_13:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $13, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 13
   ret i32 %mul
 }
@@ -212,13 +669,52 @@ define i32 @test_mul_by_13(i32 %x) {
 define i32 @test_mul_by_14(i32 %x) {
 ; X86-LABEL: test_mul_by_14:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $14, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %eax
+; X86-NEXT:    leal (%ecx,%eax,4), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_14:
-; X64:       # BB#0:
-; X64-NEXT:    imull $14, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_14:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_14:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_14:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $14, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_14:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_14:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_14:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $14, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_14:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $14, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 14
   ret i32 %mul
 }
@@ -231,12 +727,46 @@ define i32 @test_mul_by_15(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_15:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_15:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_15:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_15:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $15, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_15:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_15:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_15:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_15:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $15, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 15
   ret i32 %mul
 }
@@ -248,11 +778,47 @@ define i32 @test_mul_by_16(i32 %x) {
 ; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_16:
-; X64:       # BB#0:
-; X64-NEXT:    shll $4, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_16:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shll $4, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_16:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shll $4, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_16:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    shll $4, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_16:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    shll $4, %edi # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_16:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    shll $4, %edi # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_16:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shll $4, %edi # sched: [1:1.00]
+; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_16:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    shll $4, %edi # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 16
   ret i32 %mul
 }
@@ -266,13 +832,49 @@ define i32 @test_mul_by_17(i32 %x) {
 ; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_17:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    leal (%rax,%rdi), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_17:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    shll $4, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rdi), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_17:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    shll $4, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rdi), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_17:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $17, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_17:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_17:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_17:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    shll $4, %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rax,%rdi), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_17:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $17, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 17
   ret i32 %mul
 }
@@ -285,12 +887,46 @@ define i32 @test_mul_by_18(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,8), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_18:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    addl %edi, %edi
-; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_18:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    addl %edi, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_18:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    addl %edi, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_18:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $18, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_18:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_18:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_18:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    addl %edi, %edi # sched: [1:0.50]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_18:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $18, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 18
   ret i32 %mul
 }
@@ -298,13 +934,54 @@ define i32 @test_mul_by_18(i32 %x) {
 define i32 @test_mul_by_19(i32 %x) {
 ; X86-LABEL: test_mul_by_19:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $19, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    shll $2, %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_19:
-; X64:       # BB#0:
-; X64-NEXT:    imull $19, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_19:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $2, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    subl %eax, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_19:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    shll $2, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %eax, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_19:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $19, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_19:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_19:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_19:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $19, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_19:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $19, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 19
   ret i32 %mul
 }
@@ -317,12 +994,46 @@ define i32 @test_mul_by_20(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_20:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    shll $2, %edi
-; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_20:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    shll $2, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_20:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    shll $2, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_20:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $20, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_20:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_20:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_20:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    shll $2, %edi # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_20:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $20, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 20
   ret i32 %mul
 }
@@ -330,13 +1041,49 @@ define i32 @test_mul_by_20(i32 %x) {
 define i32 @test_mul_by_21(i32 %x) {
 ; X86-LABEL: test_mul_by_21:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $21, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_21:
-; X64:       # BB#0:
-; X64-NEXT:    imull $21, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_21:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_21:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_21:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $21, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_21:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_21:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_21:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $21, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_21:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $21, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 21
   ret i32 %mul
 }
@@ -344,13 +1091,52 @@ define i32 @test_mul_by_21(i32 %x) {
 define i32 @test_mul_by_22(i32 %x) {
 ; X86-LABEL: test_mul_by_22:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $22, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,4), %eax
+; X86-NEXT:    leal (%ecx,%eax,4), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_22:
-; X64:       # BB#0:
-; X64-NEXT:    imull $22, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_22:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_22:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rax,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_22:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $22, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_22:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_22:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_22:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $22, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_22:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $22, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 22
   ret i32 %mul
 }
@@ -358,13 +1144,54 @@ define i32 @test_mul_by_22(i32 %x) {
 define i32 @test_mul_by_23(i32 %x) {
 ; X86-LABEL: test_mul_by_23:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $23, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    shll $3, %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_23:
-; X64:       # BB#0:
-; X64-NEXT:    imull $23, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_23:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    shll $3, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    subl %eax, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_23:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    shll $3, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %eax, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_23:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $23, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_23:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_23:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_23:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $23, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_23:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $23, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 23
   ret i32 %mul
 }
@@ -377,12 +1204,46 @@ define i32 @test_mul_by_24(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_24:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    shll $3, %edi
-; X64-NEXT:    leal (%rdi,%rdi,2), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_24:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    shll $3, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_24:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    shll $3, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_24:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $24, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_24:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_24:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_24:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    shll $3, %edi # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_24:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $24, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 24
   ret i32 %mul
 }
@@ -395,12 +1256,46 @@ define i32 @test_mul_by_25(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,4), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_25:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    leal (%rax,%rax,4), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_25:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rax,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_25:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_25:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $25, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_25:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_25:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_25:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rax,%rax,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_25:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $25, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 25
   ret i32 %mul
 }
@@ -408,13 +1303,54 @@ define i32 @test_mul_by_25(i32 %x) {
 define i32 @test_mul_by_26(i32 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $26, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_26:
-; X64:       # BB#0:
-; X64-NEXT:    imull $26, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_26:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    subl %eax, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_26:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %eax, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_26:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $26, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_26:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_26:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_26:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $26, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_26:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $26, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 26
   ret i32 %mul
 }
@@ -427,12 +1363,46 @@ define i32 @test_mul_by_27(i32 %x) {
 ; X86-NEXT:    leal (%eax,%eax,2), %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_27:
-; X64:       # BB#0:
-; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_27:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_27:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_27:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $27, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_27:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_27:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_27:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_27:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $27, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 27
   ret i32 %mul
 }
@@ -440,13 +1410,52 @@ define i32 @test_mul_by_27(i32 %x) {
 define i32 @test_mul_by_28(i32 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $28, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_28:
-; X64:       # BB#0:
-; X64-NEXT:    imull $28, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_28:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_28:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_28:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $28, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_28:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_28:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_28:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $28, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_28:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $28, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 28
   ret i32 %mul
 }
@@ -454,13 +1463,55 @@ define i32 @test_mul_by_28(i32 %x) {
 define i32 @test_mul_by_29(i32 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $29, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    addl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_29:
-; X64:       # BB#0:
-; X64-NEXT:    imull $29, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_29:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    addl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_29:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    leal (%rax,%rax,2), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    addl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_29:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $29, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_29:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_29:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_29:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $29, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_29:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $29, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 29
   ret i32 %mul
 }
@@ -468,13 +1519,58 @@ define i32 @test_mul_by_29(i32 %x) {
 define i32 @test_mul_by_30(i32 %x) {
 ; X86-LABEL: test_mul_by_30:
 ; X86:       # BB#0:
-; X86-NEXT:    imull $30, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $5, %ecx
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    subl %ecx, %edx
+; X86-NEXT:    subl %edx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_30:
-; X64:       # BB#0:
-; X64-NEXT:    imull $30, %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_30:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    movl %edi, %ecx # sched: [1:0.25]
+; X64-HSW-NEXT:    subl %eax, %ecx # sched: [1:0.25]
+; X64-HSW-NEXT:    subl %ecx, %edi # sched: [1:0.25]
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_30:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    movl %edi, %ecx # sched: [1:0.17]
+; X64-JAG-NEXT:    shll $5, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %eax, %ecx # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %ecx, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_30:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $30, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_30:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_30:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_30:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imull $30, %edi, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_30:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $30, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 30
   ret i32 %mul
 }
@@ -488,12 +1584,46 @@ define i32 @test_mul_by_31(i32 %x) {
 ; X86-NEXT:    subl %ecx, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_31:
-; X64:       # BB#0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $5, %eax
-; X64-NEXT:    subl %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_31:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    shll $5, %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    subl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_31:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    shll $5, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    subl %edi, %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_31:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    imull $31, {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_31:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_31:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_31:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    shll $5, %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    subl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_31:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imull $31, %edi, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 31
   ret i32 %mul
 }
@@ -505,11 +1635,124 @@ define i32 @test_mul_by_32(i32 %x) {
 ; X86-NEXT:    shll $5, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_32:
-; X64:       # BB#0:
-; X64-NEXT:    shll $5, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_32:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shll $5, %edi # sched: [1:0.50]
+; X64-HSW-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_32:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shll $5, %edi # sched: [1:0.50]
+; X64-JAG-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_32:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    shll $5, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_32:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    shll $5, %edi # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_32:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    shll $5, %edi # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.17]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_32:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shll $5, %edi # sched: [1:1.00]
+; X64-SLM-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_32:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    shll $5, %edi # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    movl %edi, %eax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i32 %x, 32
   ret i32 %mul
 }
+
+; (x*9+42)*(x*5+2)
+define i32 @test_mul_spec(i32 %x) nounwind {
+; X86-LABEL: test_mul_spec:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal 42(%eax,%eax,8), %ecx
+; X86-NEXT:    leal 2(%eax,%eax,4), %eax
+; X86-NEXT:    imull %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-HSW-LABEL: test_mul_spec:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-HSW-NEXT:    leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; X64-HSW-NEXT:    addl $42, %ecx # sched: [1:0.25]
+; X64-HSW-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-HSW-NEXT:    addl $2, %eax # sched: [1:0.25]
+; X64-HSW-NEXT:    imull %ecx, %eax # sched: [4:1.00]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_spec:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-JAG-NEXT:    leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; X64-JAG-NEXT:    leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
+; X64-JAG-NEXT:    imull %ecx, %eax # sched: [3:1.00]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_spec:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    leal 42(%eax,%eax,8), %ecx
+; X86-NOOPT-NEXT:    leal 2(%eax,%eax,4), %eax
+; X86-NOOPT-NEXT:    imull %ecx, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_spec:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    addl $42, %ecx # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    addl $2, %eax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    imull %ecx, %eax # sched: [4:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_spec:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; JAG-NOOPT-NEXT:    leal 42(%rdi,%rdi,8), %ecx # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    leal 2(%rdi,%rdi,4), %eax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    imull %ecx, %eax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_spec:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SLM-NEXT:    leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
+; X64-SLM-NEXT:    leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
+; X64-SLM-NEXT:    imull %ecx, %eax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_spec:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; SLM-NOOPT-NEXT:    leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    imull %ecx, %eax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
+  %mul = mul nsw i32 %x, 9
+  %add = add nsw i32 %mul, 42
+  %mul2 = mul nsw i32 %x, 5
+  %add2 = add nsw i32 %mul2, 2
+  %mul3 = mul nsw i32 %add, %add2
+  ret i32 %mul3
+}
diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll
index 8579179a8231..22eb0bdc6c3f 100644
--- a/test/CodeGen/X86/mul-constant-i64.ll
+++ b/test/CodeGen/X86/mul-constant-i64.ll
@@ -1,18 +1,55 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
+; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
+; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
+; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
 
-define i64 @test_mul_by_1(i64 %x) {
+define i64 @test_mul_by_1(i64 %x) nounwind {
 ; X86-LABEL: test_mul_by_1:
 ; X86:       # BB#0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_1:
-; X64:       # BB#0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_1:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_1:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_1:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_1:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_1:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_1:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_1:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 1
   ret i64 %mul
 }
@@ -26,10 +63,43 @@ define i64 @test_mul_by_2(i64 %x) {
 ; X86-NEXT:    addl %eax, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_2:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_2:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_2:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_2:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT:    shldl $1, %eax, %edx
+; X86-NOOPT-NEXT:    addl %eax, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_2:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_2:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_2:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_2:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq (%rdi,%rdi), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 2
   ret i64 %mul
 }
@@ -43,10 +113,43 @@ define i64 @test_mul_by_3(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_3:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_3:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_3:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_3:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $3, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $3, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_3:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_3:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_3:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_3:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 3
   ret i64 %mul
 }
@@ -60,10 +163,43 @@ define i64 @test_mul_by_4(i64 %x) {
 ; X86-NEXT:    shll $2, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_4:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (,%rdi,4), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_4:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_4:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_4:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT:    shldl $2, %eax, %edx
+; X86-NOOPT-NEXT:    shll $2, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_4:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_4:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq (,%rdi,4), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_4:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_4:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq (,%rdi,4), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 4
   ret i64 %mul
 }
@@ -77,10 +213,43 @@ define i64 @test_mul_by_5(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_5:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_5:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_5:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_5:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $5, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $5, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_5:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_5:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_5:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_5:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 5
   ret i64 %mul
 }
@@ -95,11 +264,46 @@ define i64 @test_mul_by_6(i64 %x) {
 ; X86-NEXT:    leal (%edx,%ecx,2), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_6:
-; X64:       # BB#0:
-; X64-NEXT:    addq %rdi, %rdi
-; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_6:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_6:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    addq %rdi, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_6:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $6, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $6, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_6:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $6, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_6:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $6, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_6:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    addq %rdi, %rdi # sched: [1:0.50]
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_6:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $6, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 6
   ret i64 %mul
 }
@@ -115,11 +319,46 @@ define i64 @test_mul_by_7(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_7:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (,%rdi,8), %rax
-; X64-NEXT:    subq %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_7:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_7:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_7:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $7, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $7, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_7:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $7, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_7:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $7, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_7:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    subq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_7:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $7, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 7
   ret i64 %mul
 }
@@ -133,10 +372,43 @@ define i64 @test_mul_by_8(i64 %x) {
 ; X86-NEXT:    shll $3, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_8:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (,%rdi,8), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_8:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_8:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_8:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT:    shldl $3, %eax, %edx
+; X86-NOOPT-NEXT:    shll $3, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_8:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_8:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq (,%rdi,8), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_8:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_8:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq (,%rdi,8), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 8
   ret i64 %mul
 }
@@ -150,10 +422,43 @@ define i64 @test_mul_by_9(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_9:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_9:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_9:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_9:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $9, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $9, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_9:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_9:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_9:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_9:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 9
   ret i64 %mul
 }
@@ -168,11 +473,46 @@ define i64 @test_mul_by_10(i64 %x) {
 ; X86-NEXT:    leal (%edx,%ecx,2), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_10:
-; X64:       # BB#0:
-; X64-NEXT:    addq %rdi, %rdi
-; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_10:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_10:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    addq %rdi, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_10:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $10, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $10, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_10:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $10, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_10:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $10, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_10:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    addq %rdi, %rdi # sched: [1:0.50]
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_10:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $10, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 10
   ret i64 %mul
 }
@@ -180,16 +520,53 @@ define i64 @test_mul_by_10(i64 %x) {
 define i64 @test_mul_by_11(i64 %x) {
 ; X86-LABEL: test_mul_by_11:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,2), %ecx
 ; X86-NEXT:    movl $11, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $11, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_11:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $11, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_11:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_11:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_11:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $11, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $11, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_11:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $11, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_11:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $11, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_11:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $11, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_11:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $11, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 11
   ret i64 %mul
 }
@@ -204,11 +581,46 @@ define i64 @test_mul_by_12(i64 %x) {
 ; X86-NEXT:    leal (%edx,%ecx,4), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_12:
-; X64:       # BB#0:
-; X64-NEXT:    shlq $2, %rdi
-; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_12:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shlq $2, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_12:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shlq $2, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_12:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $12, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $12, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_12:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $12, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_12:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $12, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_12:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shlq $2, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_12:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $12, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 12
   ret i64 %mul
 }
@@ -216,16 +628,53 @@ define i64 @test_mul_by_12(i64 %x) {
 define i64 @test_mul_by_13(i64 %x) {
 ; X86-LABEL: test_mul_by_13:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
 ; X86-NEXT:    movl $13, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $13, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_13:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $13, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_13:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_13:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_13:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $13, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $13, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_13:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $13, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_13:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $13, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_13:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $13, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_13:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $13, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 13
   ret i64 %mul
 }
@@ -233,16 +682,56 @@ define i64 @test_mul_by_13(i64 %x) {
 define i64 @test_mul_by_14(i64 %x) {
 ; X86-LABEL: test_mul_by_14:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl $14, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $14, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_14:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $14, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_14:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_14:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_14:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $14, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $14, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_14:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $14, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_14:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $14, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_14:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $14, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_14:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $14, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 14
   ret i64 %mul
 }
@@ -258,11 +747,46 @@ define i64 @test_mul_by_15(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_15:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
-; X64-NEXT:    leaq (%rax,%rax,2), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_15:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_15:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_15:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $15, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $15, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_15:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $15, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_15:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $15, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_15:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_15:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $15, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 15
   ret i64 %mul
 }
@@ -276,11 +800,49 @@ define i64 @test_mul_by_16(i64 %x) {
 ; X86-NEXT:    shll $4, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_16:
-; X64:       # BB#0:
-; X64-NEXT:    shlq $4, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_16:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shlq $4, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_16:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shlq $4, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_16:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT:    shldl $4, %eax, %edx
+; X86-NOOPT-NEXT:    shll $4, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_16:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_16:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_16:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shlq $4, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_16:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    shlq $4, %rdi # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 16
   ret i64 %mul
 }
@@ -297,12 +859,49 @@ define i64 @test_mul_by_17(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_17:
-; X64:       # BB#0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $4, %rax
-; X64-NEXT:    leaq (%rax,%rdi), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_17:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    shlq $4, %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rdi), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_17:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    shlq $4, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rdi), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_17:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $17, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $17, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_17:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $17, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_17:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $17, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_17:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    shlq $4, %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_17:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $17, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 17
   ret i64 %mul
 }
@@ -317,11 +916,46 @@ define i64 @test_mul_by_18(i64 %x) {
 ; X86-NEXT:    leal (%edx,%ecx,2), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_18:
-; X64:       # BB#0:
-; X64-NEXT:    addq %rdi, %rdi
-; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_18:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    addq %rdi, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_18:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    addq %rdi, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_18:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $18, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $18, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_18:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $18, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_18:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $18, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_18:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    addq %rdi, %rdi # sched: [1:0.50]
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_18:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $18, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 18
   ret i64 %mul
 }
@@ -329,16 +963,58 @@ define i64 @test_mul_by_18(i64 %x) {
 define i64 @test_mul_by_19(i64 %x) {
 ; X86-LABEL: test_mul_by_19:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,4), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl $19, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $19, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_19:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $19, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_19:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    shlq $2, %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    subq %rax, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_19:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    shlq $2, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rax, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_19:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $19, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $19, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_19:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $19, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_19:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $19, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_19:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $19, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_19:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $19, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 19
   ret i64 %mul
 }
@@ -353,11 +1029,46 @@ define i64 @test_mul_by_20(i64 %x) {
 ; X86-NEXT:    leal (%edx,%ecx,4), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_20:
-; X64:       # BB#0:
-; X64-NEXT:    shlq $2, %rdi
-; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_20:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shlq $2, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_20:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shlq $2, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_20:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $20, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $20, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_20:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $20, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_20:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $20, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_20:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shlq $2, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_20:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $20, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 20
   ret i64 %mul
 }
@@ -365,16 +1076,53 @@ define i64 @test_mul_by_20(i64 %x) {
 define i64 @test_mul_by_21(i64 %x) {
 ; X86-LABEL: test_mul_by_21:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
 ; X86-NEXT:    movl $21, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $21, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_21:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $21, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_21:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_21:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_21:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $21, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $21, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_21:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $21, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_21:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $21, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_21:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $21, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_21:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $21, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 21
   ret i64 %mul
 }
@@ -382,16 +1130,56 @@ define i64 @test_mul_by_21(i64 %x) {
 define i64 @test_mul_by_22(i64 %x) {
 ; X86-LABEL: test_mul_by_22:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    leal (%eax,%ecx,4), %ecx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl $22, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $22, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_22:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $22, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_22:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_22:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rax,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_22:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $22, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $22, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_22:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $22, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_22:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $22, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_22:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $22, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_22:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $22, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 22
   ret i64 %mul
 }
@@ -399,16 +1187,58 @@ define i64 @test_mul_by_22(i64 %x) {
 define i64 @test_mul_by_23(i64 %x) {
 ; X86-LABEL: test_mul_by_23:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %eax
+; X86-NEXT:    shll $3, %eax
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl $23, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $23, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_23:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $23, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_23:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    shlq $3, %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    subq %rax, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_23:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    shlq $3, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rax, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_23:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $23, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $23, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_23:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $23, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_23:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $23, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_23:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $23, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_23:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $23, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 23
   ret i64 %mul
 }
@@ -423,11 +1253,46 @@ define i64 @test_mul_by_24(i64 %x) {
 ; X86-NEXT:    leal (%edx,%ecx,8), %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_24:
-; X64:       # BB#0:
-; X64-NEXT:    shlq $3, %rdi
-; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_24:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shlq $3, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_24:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shlq $3, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_24:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $24, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $24, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_24:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $24, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_24:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $24, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_24:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shlq $3, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_24:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $24, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 24
   ret i64 %mul
 }
@@ -443,11 +1308,46 @@ define i64 @test_mul_by_25(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_25:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
-; X64-NEXT:    leaq (%rax,%rax,4), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_25:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_25:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_25:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $25, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $25, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_25:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $25, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_25:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $25, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_25:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq (%rax,%rax,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_25:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $25, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 25
   ret i64 %mul
 }
@@ -455,16 +1355,58 @@ define i64 @test_mul_by_25(i64 %x) {
 define i64 @test_mul_by_26(i64 %x) {
 ; X86-LABEL: test_mul_by_26:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    subl %eax, %ecx
 ; X86-NEXT:    movl $26, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $26, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_26:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $26, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_26:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    subq %rax, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_26:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rax, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_26:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $26, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $26, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_26:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $26, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_26:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $26, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_26:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $26, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_26:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $26, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 26
   ret i64 %mul
 }
@@ -480,11 +1422,46 @@ define i64 @test_mul_by_27(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_27:
-; X64:       # BB#0:
-; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
-; X64-NEXT:    leaq (%rax,%rax,2), %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_27:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_27:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_27:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $27, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $27, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_27:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $27, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_27:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $27, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_27:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_27:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $27, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 27
   ret i64 %mul
 }
@@ -492,16 +1469,56 @@ define i64 @test_mul_by_27(i64 %x) {
 define i64 @test_mul_by_28(i64 %x) {
 ; X86-LABEL: test_mul_by_28:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl $28, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $28, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_28:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $28, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_28:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_28:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_28:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $28, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $28, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_28:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $28, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_28:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $28, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_28:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $28, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_28:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $28, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 28
   ret i64 %mul
 }
@@ -509,16 +1526,59 @@ define i64 @test_mul_by_28(i64 %x) {
 define i64 @test_mul_by_29(i64 %x) {
 ; X86-LABEL: test_mul_by_29:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    addl %eax, %ecx
 ; X86-NEXT:    movl $29, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $29, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_29:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $29, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_29:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    addq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_29:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq (%rdi,%rdi,8), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq (%rax,%rax,2), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    addq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_29:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $29, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $29, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_29:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $29, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_29:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $29, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_29:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $29, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_29:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $29, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 29
   ret i64 %mul
 }
@@ -526,16 +1586,60 @@ define i64 @test_mul_by_29(i64 %x) {
 define i64 @test_mul_by_30(i64 %x) {
 ; X86-LABEL: test_mul_by_30:
 ; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    shll $5, %ecx
 ; X86-NEXT:    movl $30, %eax
 ; X86-NEXT:    mull {{[0-9]+}}(%esp)
-; X86-NEXT:    imull $30, {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_30:
-; X64:       # BB#0:
-; X64-NEXT:    imulq $30, %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_30:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    movq %rdi, %rcx # sched: [1:0.25]
+; X64-HSW-NEXT:    subq %rax, %rcx # sched: [1:0.25]
+; X64-HSW-NEXT:    subq %rcx, %rdi # sched: [1:0.25]
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_30:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    movq %rdi, %rcx # sched: [1:0.17]
+; X64-JAG-NEXT:    shlq $5, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rax, %rcx # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rcx, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_30:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $30, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $30, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_30:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $30, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_30:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $30, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_30:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    imulq $30, %rdi, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_30:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $30, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 30
   ret i64 %mul
 }
@@ -552,12 +1656,49 @@ define i64 @test_mul_by_31(i64 %x) {
 ; X86-NEXT:    addl %ecx, %edx
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_31:
-; X64:       # BB#0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    shlq $5, %rax
-; X64-NEXT:    subq %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_31:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    shlq $5, %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    subq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_31:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    shlq $5, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    subq %rdi, %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_31:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl $31, %eax
+; X86-NOOPT-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NOOPT-NEXT:    imull $31, {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_31:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    imulq $31, %rdi, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_31:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    imulq $31, %rdi, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_31:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    shlq $5, %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    subq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_31:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    imulq $31, %rdi, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 31
   ret i64 %mul
 }
@@ -571,11 +1712,168 @@ define i64 @test_mul_by_32(i64 %x) {
 ; X86-NEXT:    shll $5, %eax
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: test_mul_by_32:
-; X64:       # BB#0:
-; X64-NEXT:    shlq $5, %rdi
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    retq
+; X64-HSW-LABEL: test_mul_by_32:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    shlq $5, %rdi # sched: [1:0.50]
+; X64-HSW-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_by_32:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    shlq $5, %rdi # sched: [1:0.50]
+; X64-JAG-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_by_32:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NOOPT-NEXT:    shldl $5, %eax, %edx
+; X86-NOOPT-NEXT:    shll $5, %eax
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_by_32:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_by_32:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.17]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_by_32:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    shlq $5, %rdi # sched: [1:1.00]
+; X64-SLM-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_by_32:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    shlq $5, %rdi # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    movq %rdi, %rax # sched: [1:0.50]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
   %mul = mul nsw i64 %x, 32
   ret i64 %mul
 }
+
+; (x*9+42)*(x*5+2)
+define i64 @test_mul_spec(i64 %x) nounwind {
+; X86-LABEL: test_mul_spec:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl $9, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, %esi
+; X86-NEXT:    leal (%edi,%edi,8), %ebx
+; X86-NEXT:    addl $42, %esi
+; X86-NEXT:    adcl %edx, %ebx
+; X86-NEXT:    movl $5, %edx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    mull %edx
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    leal (%edi,%edi,4), %edi
+; X86-NEXT:    addl $2, %ecx
+; X86-NEXT:    adcl %edx, %edi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    mull %ecx
+; X86-NEXT:    imull %esi, %edi
+; X86-NEXT:    addl %edi, %edx
+; X86-NEXT:    imull %ebx, %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-HSW-LABEL: test_mul_spec:
+; X64-HSW:       # BB#0:
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; X64-HSW-NEXT:    addq $42, %rcx # sched: [1:0.25]
+; X64-HSW-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-HSW-NEXT:    addq $2, %rax # sched: [1:0.25]
+; X64-HSW-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
+; X64-HSW-NEXT:    retq # sched: [1:1.00]
+;
+; X64-JAG-LABEL: test_mul_spec:
+; X64-JAG:       # BB#0:
+; X64-JAG-NEXT:    leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; X64-JAG-NEXT:    leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
+; X64-JAG-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
+; X64-JAG-NEXT:    retq # sched: [4:1.00]
+;
+; X86-NOOPT-LABEL: test_mul_spec:
+; X86-NOOPT:       # BB#0:
+; X86-NOOPT-NEXT:    pushl %ebx
+; X86-NOOPT-NEXT:    pushl %edi
+; X86-NOOPT-NEXT:    pushl %esi
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOOPT-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NOOPT-NEXT:    movl $9, %edx
+; X86-NOOPT-NEXT:    movl %ecx, %eax
+; X86-NOOPT-NEXT:    mull %edx
+; X86-NOOPT-NEXT:    movl %eax, %esi
+; X86-NOOPT-NEXT:    leal (%edi,%edi,8), %ebx
+; X86-NOOPT-NEXT:    addl $42, %esi
+; X86-NOOPT-NEXT:    adcl %edx, %ebx
+; X86-NOOPT-NEXT:    movl $5, %edx
+; X86-NOOPT-NEXT:    movl %ecx, %eax
+; X86-NOOPT-NEXT:    mull %edx
+; X86-NOOPT-NEXT:    movl %eax, %ecx
+; X86-NOOPT-NEXT:    leal (%edi,%edi,4), %edi
+; X86-NOOPT-NEXT:    addl $2, %ecx
+; X86-NOOPT-NEXT:    adcl %edx, %edi
+; X86-NOOPT-NEXT:    movl %esi, %eax
+; X86-NOOPT-NEXT:    mull %ecx
+; X86-NOOPT-NEXT:    imull %esi, %edi
+; X86-NOOPT-NEXT:    addl %edi, %edx
+; X86-NOOPT-NEXT:    imull %ebx, %ecx
+; X86-NOOPT-NEXT:    addl %ecx, %edx
+; X86-NOOPT-NEXT:    popl %esi
+; X86-NOOPT-NEXT:    popl %edi
+; X86-NOOPT-NEXT:    popl %ebx
+; X86-NOOPT-NEXT:    retl
+;
+; HSW-NOOPT-LABEL: test_mul_spec:
+; HSW-NOOPT:       # BB#0:
+; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    addq $42, %rcx # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    leaq (%rdi,%rdi,4), %rax # sched: [1:0.50]
+; HSW-NOOPT-NEXT:    addq $2, %rax # sched: [1:0.25]
+; HSW-NOOPT-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
+; HSW-NOOPT-NEXT:    retq # sched: [1:1.00]
+;
+; JAG-NOOPT-LABEL: test_mul_spec:
+; JAG-NOOPT:       # BB#0:
+; JAG-NOOPT-NEXT:    leaq 42(%rdi,%rdi,8), %rcx # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    leaq 2(%rdi,%rdi,4), %rax # sched: [1:0.50]
+; JAG-NOOPT-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
+; JAG-NOOPT-NEXT:    retq # sched: [4:1.00]
+;
+; X64-SLM-LABEL: test_mul_spec:
+; X64-SLM:       # BB#0:
+; X64-SLM-NEXT:    leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
+; X64-SLM-NEXT:    leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
+; X64-SLM-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
+; X64-SLM-NEXT:    retq # sched: [4:1.00]
+;
+; SLM-NOOPT-LABEL: test_mul_spec:
+; SLM-NOOPT:       # BB#0:
+; SLM-NOOPT-NEXT:    leaq 42(%rdi,%rdi,8), %rcx # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    leaq 2(%rdi,%rdi,4), %rax # sched: [1:1.00]
+; SLM-NOOPT-NEXT:    imulq %rcx, %rax # sched: [3:1.00]
+; SLM-NOOPT-NEXT:    retq # sched: [4:1.00]
+  %mul = mul nsw i64 %x, 9
+  %add = add nsw i64 %mul, 42
+  %mul2 = mul nsw i64 %x, 5
+  %add2 = add nsw i64 %mul2, 2
+  %mul3 = mul nsw i64 %add, %add2
+  ret i64 %mul3
+}
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index 391f1cc9fb43..1b8f8e7ae559 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -41,14 +41,67 @@ entry:
   ret <8 x i16> %3
 }
 
-define void @pr26232(i64 %a) {
+define void @pr26232(i64 %a, <16 x i1> %b) {
 ; AVX-LABEL: pr26232:
 ; AVX:       # BB#0: # %for_loop599.preheader
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    .p2align 4, 0x90
 ; AVX-NEXT:  .LBB1_1: # %for_loop599
 ; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    xorl %eax, %eax
 ; AVX-NEXT:    cmpq $65536, %rdi # imm = 0x10000
-; AVX-NEXT:    setl -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    setl %al
+; AVX-NEXT:    vmovd %eax, %xmm2
+; AVX-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
+; AVX-NEXT:    vpand %xmm0, %xmm2, %xmm2
+; AVX-NEXT:    vpextrb $15, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $14, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $13, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $12, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $11, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $10, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $9, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $8, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $7, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $6, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $5, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $4, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $3, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $2, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $1, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    vpextrb $0, %xmm2, %eax
+; AVX-NEXT:    andb $1, %al
+; AVX-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
 ; AVX-NEXT:    cmpw $0, -{{[0-9]+}}(%rsp)
 ; AVX-NEXT:    jne .LBB1_1
 ; AVX-NEXT:  # BB#2: # %for_exit600
@@ -61,6 +114,9 @@ define void @pr26232(i64 %a) {
 ; KNL-32-NEXT:    .cfi_def_cfa_offset 8
 ; KNL-32-NEXT:  .Lcfi1:
 ; KNL-32-NEXT:    .cfi_offset %esi, -8
+; KNL-32-NEXT:    vpmovsxbd %xmm0, %zmm0
+; KNL-32-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-32-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; KNL-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; KNL-32-NEXT:    movw $-1, %dx
@@ -72,6 +128,9 @@ define void @pr26232(i64 %a) {
 ; KNL-32-NEXT:    sbbl $0, %esi
 ; KNL-32-NEXT:    movl $0, %esi
 ; KNL-32-NEXT:    cmovlw %dx, %si
+; KNL-32-NEXT:    kmovw %esi, %k1
+; KNL-32-NEXT:    kandw %k0, %k1, %k1
+; KNL-32-NEXT:    kmovw %k1, %esi
 ; KNL-32-NEXT:    testw %si, %si
 ; KNL-32-NEXT:    jne .LBB1_1
 ; KNL-32-NEXT:  # BB#2: # %for_exit600
@@ -87,7 +146,7 @@ for_loop599:                                      ; preds = %for_loop599, %for_t
   %less_i_load605_ = icmp slt i64 %a, 65536
   %less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0
   %less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer
-  %"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, undef
+  %"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, %b
   %intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16
   %res.i895 = icmp eq i16 %intmask.i894, 0
   br i1 %res.i895, label %for_exit600, label %for_loop599
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 8cc1d8c765ac..53e471d6f175 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -1749,6 +1749,62 @@ entry:
  ret <4 x i64> %Y
 }
 
+define <2 x i64> @load_sext_4i8_to_4i64_extract(<4 x i8> *%ptr) {
+; SSE2-LABEL: load_sext_4i8_to_4i64_extract:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movsbq 3(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm1
+; SSE2-NEXT:    movsbq 2(%rdi), %rax
+; SSE2-NEXT:    movq %rax, %xmm0
+; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: load_sext_4i8_to_4i64_extract:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movsbq 3(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm1
+; SSSE3-NEXT:    movsbq 2(%rdi), %rax
+; SSSE3-NEXT:    movq %rax, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: load_sext_4i8_to_4i64_extract:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovsxbq 2(%rdi), %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: load_sext_4i8_to_4i64_extract:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovsxbd (%rdi), %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_sext_4i8_to_4i64_extract:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: load_sext_4i8_to_4i64_extract:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovsxbq (%rdi), %ymm0
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
+; X32-SSE41-LABEL: load_sext_4i8_to_4i64_extract:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    pmovsxbq 2(%eax), %xmm0
+; X32-SSE41-NEXT:    retl
+ %ld = load <4 x i8>, <4 x i8>* %ptr
+ %sext = sext <4 x i8> %ld to <4 x i64>
+ %extract = shufflevector <4 x i64> %sext, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
+ ret <2 x i64> %extract
+}
+
 define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
 ; SSE2-LABEL: load_sext_8i1_to_8i16:
 ; SSE2:       # BB#0: # %entry
diff --git a/test/CodeGen/X86/xchg-nofold.ll b/test/CodeGen/X86/xchg-nofold.ll
new file mode 100644
index 000000000000..fddc7906e08f
--- /dev/null
+++ b/test/CodeGen/X86/xchg-nofold.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s
+
+%"struct.std::atomic" = type { %"struct.std::atomic_bool" }
+%"struct.std::atomic_bool" = type { %"struct.std::__atomic_base" }
+%"struct.std::__atomic_base" = type { i8 }
+
+; CHECK-LABEL: _Z3fooRSt6atomicIbEb
+define zeroext i1 @_Z3fooRSt6atomicIbEb(%"struct.std::atomic"* nocapture dereferenceable(1) %a, i1 returned zeroext %b) nounwind {
+entry:
+  %frombool.i.i = zext i1 %b to i8
+  %_M_i.i.i = getelementptr inbounds %"struct.std::atomic", %"struct.std::atomic"* %a, i64 0, i32 0, i32 0, i32 0
+  %0 = ptrtoint i8* %_M_i.i.i to i64
+  %1 = lshr i64 %0, 3
+  %2 = add i64 %1, 2147450880
+  %3 = inttoptr i64 %2 to i8*
+  %4 = load i8, i8* %3
+  %5 = icmp ne i8 %4, 0
+  br i1 %5, label %6, label %11
+
+; <label>:6:                                      ; preds = %entry
+  %7 = and i64 %0, 7
+  %8 = trunc i64 %7 to i8
+  %9 = icmp sge i8 %8, %4
+  br i1 %9, label %10, label %11
+
+; <label>:10:                                     ; preds = %6
+  call void @__asan_report_store1(i64 %0)
+  call void asm sideeffect "", ""()
+  unreachable
+
+; <label>:11:                                     ; preds = %6, %entry
+  store atomic i8 %frombool.i.i, i8* %_M_i.i.i seq_cst, align 1
+; CHECK: xchgb	%{{.*}}, (%{{.*}})
+  ret i1 %b
+}
+
+declare void @__asan_report_store1(i64)
diff --git a/test/MC/AArch64/ldr-pseudo.s b/test/MC/AArch64/ldr-pseudo.s
index e132f7cf651f..1d99d1401801 100644
--- a/test/MC/AArch64/ldr-pseudo.s
+++ b/test/MC/AArch64/ldr-pseudo.s
@@ -205,6 +205,13 @@ f18:
   ldr x1, =0x320064
 // CHECK: ldr x1, .Ltmp[[TMP26:[0-9]+]]
 
+// We previously used a DenseMap with constant values as keys, check that
+// sentinel values can be used.
+  ldr x0, =0x7ffffffffffffffe
+// CHECK: ldr x0, .Ltmp[[TMP27:[0-9]+]]
+  ldr x1, =0x7fffffffffffffff
+// CHECK: ldr x1, .Ltmp[[TMP28:[0-9]+]]
+
 //
 // Constant Pools
 //
@@ -311,3 +318,8 @@ f18:
 // CHECK: .p2align 2
 // CHECK: .Ltmp[[TMP25]]
 // CHECK: .word 3276900
+
+// CHECK: .Ltmp[[TMP27]]
+// CHECK: .xword 9223372036854775806
+// CHECK: .Ltmp[[TMP28]]
+// CHECK: .xword 9223372036854775807
diff --git a/test/MC/Disassembler/SystemZ/insns-z13.txt b/test/MC/Disassembler/SystemZ/insns-z13.txt
index 4f5ec43f7348..c48bdee8d613 100644
--- a/test/MC/Disassembler/SystemZ/insns-z13.txt
+++ b/test/MC/Disassembler/SystemZ/insns-z13.txt
@@ -2,6 +2,114 @@
 # RUN: llvm-mc --disassemble %s -triple=s390x-linux-gnu -mcpu=z13 \
 # RUN:   | FileCheck %s
 
+# CHECK: cdpt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xae
+
+# CHECK: cdpt %f15, 0(1), 0
+0xed 0x00 0x00 0x00 0xf0 0xae
+
+# CHECK: cdpt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xae
+
+# CHECK: cdpt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xae
+
+# CHECK: cdpt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xae
+
+# CHECK: cdpt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xae
+
+# CHECK: cdpt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xae
+
+# CHECK: cdpt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xae
+
+# CHECK: cdpt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xae
+
+# CHECK: cpdt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xac
+
+# CHECK: cpdt %f15, 0(1), 0
+0xed 0x00 0x00 0x00 0xf0 0xac
+
+# CHECK: cpdt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xac
+
+# CHECK: cpdt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xac
+
+# CHECK: cpdt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xac
+
+# CHECK: cpdt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xac
+
+# CHECK: cpdt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xac
+
+# CHECK: cpdt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xac
+
+# CHECK: cpdt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xac
+
+# CHECK: cpxt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xad
+
+# CHECK: cpxt %f13, 0(1), 0
+0xed 0x00 0x00 0x00 0xd0 0xad
+
+# CHECK: cpxt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xad
+
+# CHECK: cpxt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xad
+
+# CHECK: cpxt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xad
+
+# CHECK: cpxt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xad
+
+# CHECK: cpxt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xad
+
+# CHECK: cpxt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xad
+
+# CHECK: cpxt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xad
+
+# CHECK: cxpt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xaf
+
+# CHECK: cxpt %f13, 0(1), 0
+0xed 0x00 0x00 0x00 0xd0 0xaf
+
+# CHECK: cxpt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xaf
+
+# CHECK: cxpt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xaf
+
+# CHECK: cxpt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xaf
+
+# CHECK: cxpt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xaf
+
+# CHECK: cxpt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xaf
+
+# CHECK: cxpt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xaf
+
+# CHECK: cxpt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xaf
+
 # CHECK: lcbb %r0, 0, 0
 0xe7 0x00 0x00 0x00 0x00 0x27
 
diff --git a/test/MC/Disassembler/SystemZ/insns.txt b/test/MC/Disassembler/SystemZ/insns.txt
index dac94099f276..75f7f9669b5c 100644
--- a/test/MC/Disassembler/SystemZ/insns.txt
+++ b/test/MC/Disassembler/SystemZ/insns.txt
@@ -22,6 +22,27 @@
 # CHECK: a %r15, 0
 0x5a 0xf0 0x00 0x00
 
+# CHECK: ad %f0, 0
+0x6a 0x00 0x00 0x00
+
+# CHECK: ad %f0, 4095
+0x6a 0x00 0x0f 0xff
+
+# CHECK: ad %f0, 0(%r1)
+0x6a 0x00 0x10 0x00
+
+# CHECK: ad %f0, 0(%r15)
+0x6a 0x00 0xf0 0x00
+
+# CHECK: ad %f0, 4095(%r1,%r15)
+0x6a 0x01 0xff 0xff
+
+# CHECK: ad %f0, 4095(%r15,%r1)
+0x6a 0x0f 0x1f 0xff
+
+# CHECK: ad %f15, 0
+0x6a 0xf0 0x00 0x00
+
 # CHECK: adb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x1a
 
@@ -55,6 +76,72 @@
 # CHECK: adbr %f15, %f0
 0xb3 0x1a 0x00 0xf0
 
+# CHECK: adr %f0, %f0
+0x2a 0x00
+
+# CHECK: adr %f0, %f15
+0x2a 0x0f
+
+# CHECK: adr %f7, %f8
+0x2a 0x78
+
+# CHECK: adr %f15, %f0
+0x2a 0xf0
+
+# CHECK: adtr %f0, %f0, %f0
+0xb3 0xd2 0x00 0x00
+
+# CHECK: adtr %f0, %f0, %f15
+0xb3 0xd2 0xf0 0x00
+
+# CHECK: adtr %f0, %f15, %f0
+0xb3 0xd2 0x00 0x0f
+
+# CHECK: adtr %f15, %f0, %f0
+0xb3 0xd2 0x00 0xf0
+
+# CHECK: adtr %f7, %f8, %f9
+0xb3 0xd2 0x90 0x78
+
+# CHECK: adtra %f0, %f0, %f0, 1
+0xb3 0xd2 0x01 0x00
+
+# CHECK: adtra %f0, %f0, %f0, 15
+0xb3 0xd2 0x0f 0x00
+
+# CHECK: adtra %f0, %f0, %f15, 1
+0xb3 0xd2 0xf1 0x00
+
+# CHECK: adtra %f0, %f15, %f0, 1
+0xb3 0xd2 0x01 0x0f
+
+# CHECK: adtra %f15, %f0, %f0, 1
+0xb3 0xd2 0x01 0xf0
+
+# CHECK: adtra %f7, %f8, %f9, 10
+0xb3 0xd2 0x9a 0x78
+
+# CHECK: ae %f0, 0
+0x7a 0x00 0x00 0x00
+
+# CHECK: ae %f0, 4095
+0x7a 0x00 0x0f 0xff
+
+# CHECK: ae %f0, 0(%r1)
+0x7a 0x00 0x10 0x00
+
+# CHECK: ae %f0, 0(%r15)
+0x7a 0x00 0xf0 0x00
+
+# CHECK: ae %f0, 4095(%r1,%r15)
+0x7a 0x01 0xff 0xff
+
+# CHECK: ae %f0, 4095(%r15,%r1)
+0x7a 0x0f 0x1f 0xff
+
+# CHECK: ae %f15, 0
+0x7a 0xf0 0x00 0x00
+
 # CHECK: aeb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x0a
 
@@ -88,6 +175,18 @@
 # CHECK: aebr %f15, %f0
 0xb3 0x0a 0x00 0xf0
 
+# CHECK: aer %f0, %f0
+0x3a 0x00
+
+# CHECK: aer %f0, %f15
+0x3a 0x0f
+
+# CHECK: aer %f7, %f8
+0x3a 0x78
+
+# CHECK: aer %f15, %f0
+0x3a 0xf0
+
 # CHECK: afi %r0, -2147483648
 0xc2 0x09 0x80 0x00 0x00 0x00
 
@@ -856,6 +955,72 @@
 # CHECK: asi 524287(%r15), 42
 0xeb 0x2a 0xff 0xff 0x7f 0x6a
 
+# CHECK: au %f0, 0
+0x7e 0x00 0x00 0x00
+
+# CHECK: au %f0, 4095
+0x7e 0x00 0x0f 0xff
+
+# CHECK: au %f0, 0(%r1)
+0x7e 0x00 0x10 0x00
+
+# CHECK: au %f0, 0(%r15)
+0x7e 0x00 0xf0 0x00
+
+# CHECK: au %f0, 4095(%r1,%r15)
+0x7e 0x01 0xff 0xff
+
+# CHECK: au %f0, 4095(%r15,%r1)
+0x7e 0x0f 0x1f 0xff
+
+# CHECK: au %f15, 0
+0x7e 0xf0 0x00 0x00
+
+# CHECK: aur %f0, %f0
+0x3e 0x00
+
+# CHECK: aur %f0, %f15
+0x3e 0x0f
+
+# CHECK: aur %f7, %f8
+0x3e 0x78
+
+# CHECK: aur %f15, %f0
+0x3e 0xf0
+
+# CHECK: aw %f0, 0
+0x6e 0x00 0x00 0x00
+
+# CHECK: aw %f0, 4095
+0x6e 0x00 0x0f 0xff
+
+# CHECK: aw %f0, 0(%r1)
+0x6e 0x00 0x10 0x00
+
+# CHECK: aw %f0, 0(%r15)
+0x6e 0x00 0xf0 0x00
+
+# CHECK: aw %f0, 4095(%r1,%r15)
+0x6e 0x01 0xff 0xff
+
+# CHECK: aw %f0, 4095(%r15,%r1)
+0x6e 0x0f 0x1f 0xff
+
+# CHECK: aw %f15, 0
+0x6e 0xf0 0x00 0x00
+
+# CHECK: awr %f0, %f0
+0x2e 0x00
+
+# CHECK: awr %f0, %f15
+0x2e 0x0f
+
+# CHECK: awr %f7, %f8
+0x2e 0x78
+
+# CHECK: awr %f15, %f0
+0x2e 0xf0
+
 # CHECK: axbr %f0, %f0
 0xb3 0x4a 0x00 0x00
 
@@ -868,6 +1033,51 @@
 # CHECK: axbr %f13, %f0
 0xb3 0x4a 0x00 0xd0
 
+# CHECK: axr %f0, %f0
+0x36 0x00
+
+# CHECK: axr %f0, %f13
+0x36 0x0d
+
+# CHECK: axr %f8, %f8
+0x36 0x88
+
+# CHECK: axr %f13, %f0
+0x36 0xd0
+
+# CHECK: axtr %f0, %f0, %f0
+0xb3 0xda 0x00 0x00
+
+# CHECK: axtr %f0, %f0, %f13
+0xb3 0xda 0xd0 0x00
+
+# CHECK: axtr %f0, %f13, %f0
+0xb3 0xda 0x00 0x0d
+
+# CHECK: axtr %f13, %f0, %f0
+0xb3 0xda 0x00 0xd0
+
+# CHECK: axtr %f8, %f8, %f8
+0xb3 0xda 0x80 0x88
+
+# CHECK: axtra %f0, %f0, %f0, 1
+0xb3 0xda 0x01 0x00
+
+# CHECK: axtra %f0, %f0, %f0, 15
+0xb3 0xda 0x0f 0x00
+
+# CHECK: axtra %f0, %f0, %f13, 1
+0xb3 0xda 0xd1 0x00
+
+# CHECK: axtra %f0, %f13, %f0, 1
+0xb3 0xda 0x01 0x0d
+
+# CHECK: axtra %f13, %f0, %f0, 1
+0xb3 0xda 0x01 0xd0
+
+# CHECK: axtra %f8, %f8, %f8, 8
+0xb3 0xda 0x88 0x88
+
 # CHECK: ay %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x5a
 
@@ -1348,6 +1558,27 @@
 # CHECK: c %r15, 0
 0x59 0xf0 0x00 0x00
 
+# CHECK: cd %f0, 0
+0x69 0x00 0x00 0x00
+
+# CHECK: cd %f0, 4095
+0x69 0x00 0x0f 0xff
+
+# CHECK: cd %f0, 0(%r1)
+0x69 0x00 0x10 0x00
+
+# CHECK: cd %f0, 0(%r15)
+0x69 0x00 0xf0 0x00
+
+# CHECK: cd %f0, 4095(%r1,%r15)
+0x69 0x01 0xff 0xff
+
+# CHECK: cd %f0, 4095(%r15,%r1)
+0x69 0x0f 0x1f 0xff
+
+# CHECK: cd %f15, 0
+0x69 0xf0 0x00 0x00
+
 # CHECK: cdb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x19
 
@@ -1414,6 +1645,39 @@
 # CHECK: cdfbra %f15, 0, %r0, 1
 0xb3 0x95 0x01 0xf0
 
+# CHECK: cdfr %f0, %r0
+0xb3 0xb5 0x00 0x00
+
+# CHECK: cdfr %f0, %r15
+0xb3 0xb5 0x00 0x0f
+
+# CHECK: cdfr %f15, %r0
+0xb3 0xb5 0x00 0xf0
+
+# CHECK: cdfr %f7, %r8
+0xb3 0xb5 0x00 0x78
+
+# CHECK: cdfr %f15, %r15
+0xb3 0xb5 0x00 0xff
+
+# CHECK: cdftr %f0, 0, %r0, 0
+0xb9 0x51 0x00 0x00
+
+# CHECK: cdftr %f0, 0, %r0, 15
+0xb9 0x51 0x0f 0x00
+
+# CHECK: cdftr %f0, 0, %r15, 0
+0xb9 0x51 0x00 0x0f
+
+# CHECK: cdftr %f0, 15, %r0, 0
+0xb9 0x51 0xf0 0x00
+
+# CHECK: cdftr %f4, 5, %r6, 7
+0xb9 0x51 0x57 0x46
+
+# CHECK: cdftr %f15, 0, %r0, 0
+0xb9 0x51 0x00 0xf0
+
 # CHECK: cdgbr %f0, %r0
 0xb3 0xa5 0x00 0x00
 
@@ -1447,6 +1711,54 @@
 # CHECK: cdgbra %f15, 0, %r0, 1
 0xb3 0xa5 0x01 0xf0
 
+# CHECK: cdgr %f0, %r0
+0xb3 0xc5 0x00 0x00
+
+# CHECK: cdgr %f0, %r15
+0xb3 0xc5 0x00 0x0f
+
+# CHECK: cdgr %f15, %r0
+0xb3 0xc5 0x00 0xf0
+
+# CHECK: cdgr %f7, %r8
+0xb3 0xc5 0x00 0x78
+
+# CHECK: cdgr %f15, %r15
+0xb3 0xc5 0x00 0xff
+
+# CHECK: cdgtr %f0, %r0
+0xb3 0xf1 0x00 0x00
+
+# CHECK: cdgtr %f0, %r15
+0xb3 0xf1 0x00 0x0f
+
+# CHECK: cdgtr %f15, %r0
+0xb3 0xf1 0x00 0xf0
+
+# CHECK: cdgtr %f7, %r8
+0xb3 0xf1 0x00 0x78
+
+# CHECK: cdgtr %f15, %r15
+0xb3 0xf1 0x00 0xff
+
+# CHECK: cdgtra %f0, 0, %r0, 1
+0xb3 0xf1 0x01 0x00
+
+# CHECK: cdgtra %f0, 0, %r0, 15
+0xb3 0xf1 0x0f 0x00
+
+# CHECK: cdgtra %f0, 0, %r15, 1
+0xb3 0xf1 0x01 0x0f
+
+# CHECK: cdgtra %f0, 15, %r0, 1
+0xb3 0xf1 0xf1 0x00
+
+# CHECK: cdgtra %f4, 5, %r6, 7
+0xb3 0xf1 0x57 0x46
+
+# CHECK: cdgtra %f15, 0, %r0, 1
+0xb3 0xf1 0x01 0xf0
+
 # CHECK: cdlfbr %f0, 0, %r0, 1
 0xb3 0x91 0x01 0x00
 
@@ -1465,6 +1777,24 @@
 # CHECK: cdlfbr %f15, 0, %r0, 1
 0xb3 0x91 0x01 0xf0
 
+# CHECK: cdlftr %f0, 0, %r0, 0
+0xb9 0x53 0x00 0x00
+
+# CHECK: cdlftr %f0, 0, %r0, 15
+0xb9 0x53 0x0f 0x00
+
+# CHECK: cdlftr %f0, 0, %r15, 0
+0xb9 0x53 0x00 0x0f
+
+# CHECK: cdlftr %f0, 15, %r0, 0
+0xb9 0x53 0xf0 0x00
+
+# CHECK: cdlftr %f4, 5, %r6, 7
+0xb9 0x53 0x57 0x46
+
+# CHECK: cdlftr %f15, 0, %r0, 0
+0xb9 0x53 0x00 0xf0
+
 # CHECK: cdlgbr %f0, 0, %r0, 1
 0xb3 0xa1 0x01 0x00
 
@@ -1483,6 +1813,36 @@
 # CHECK: cdlgbr %f15, 0, %r0, 1
 0xb3 0xa1 0x01 0xf0
 
+# CHECK: cdlgtr %f0, 0, %r0, 0
+0xb9 0x52 0x00 0x00
+
+# CHECK: cdlgtr %f0, 0, %r0, 15
+0xb9 0x52 0x0f 0x00
+
+# CHECK: cdlgtr %f0, 0, %r15, 0
+0xb9 0x52 0x00 0x0f
+
+# CHECK: cdlgtr %f0, 15, %r0, 0
+0xb9 0x52 0xf0 0x00
+
+# CHECK: cdlgtr %f4, 5, %r6, 7
+0xb9 0x52 0x57 0x46
+
+# CHECK: cdlgtr %f15, 0, %r0, 0
+0xb9 0x52 0x00 0xf0
+
+# CHECK: cdr %f0, %f0
+0x29 0x00
+
+# CHECK: cdr %f0, %f15
+0x29 0x0f
+
+# CHECK: cdr %f7, %f8
+0x29 0x78
+
+# CHECK: cdr %f15, %f0
+0x29 0xf0
+
 # CHECK: cds %r0, %r0, 0
 0xbb 0x00 0x00 0x00
 
@@ -1540,6 +1900,21 @@
 # CHECK: cdsg %r14, %r0, 0
 0xeb 0xe0 0x00 0x00 0x00 0x3e
 
+# CHECK: cdstr %f0, %r0
+0xb3 0xf3 0x00 0x00
+
+# CHECK: cdstr %f0, %r15
+0xb3 0xf3 0x00 0x0f
+
+# CHECK: cdstr %f15, %r0
+0xb3 0xf3 0x00 0xf0
+
+# CHECK: cdstr %f7, %r8
+0xb3 0xf3 0x00 0x78
+
+# CHECK: cdstr %f15, %r15
+0xb3 0xf3 0x00 0xff
+
 # CHECK: cdsy %r0, %r0, -524288
 0xeb 0x00 0x00 0x00 0x80 0x31
 
@@ -1573,6 +1948,81 @@
 # CHECK: cdsy %r14, %r0, 0
 0xeb 0xe0 0x00 0x00 0x00 0x31
 
+# CHECK: cdtr %f0, %f0
+0xb3 0xe4 0x00 0x00
+
+# CHECK: cdtr %f0, %f15
+0xb3 0xe4 0x00 0x0f
+
+# CHECK: cdtr %f7, %f8
+0xb3 0xe4 0x00 0x78
+
+# CHECK: cdtr %f15, %f0
+0xb3 0xe4 0x00 0xf0
+
+# CHECK: cdutr %f0, %r0
+0xb3 0xf2 0x00 0x00
+
+# CHECK: cdutr %f0, %r15
+0xb3 0xf2 0x00 0x0f
+
+# CHECK: cdutr %f15, %r0
+0xb3 0xf2 0x00 0xf0
+
+# CHECK: cdutr %f7, %r8
+0xb3 0xf2 0x00 0x78
+
+# CHECK: cdutr %f15, %r15
+0xb3 0xf2 0x00 0xff
+
+# CHECK: cdzt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xaa
+
+# CHECK: cdzt %f15, 0(1), 0
+0xed 0x00 0x00 0x00 0xf0 0xaa
+
+# CHECK: cdzt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xaa
+
+# CHECK: cdzt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xaa
+
+# CHECK: cdzt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xaa
+
+# CHECK: cdzt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xaa
+
+# CHECK: cdzt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xaa
+
+# CHECK: cdzt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xaa
+
+# CHECK: cdzt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xaa
+
+# CHECK: ce %f0, 0
+0x79 0x00 0x00 0x00
+
+# CHECK: ce %f0, 4095
+0x79 0x00 0x0f 0xff
+
+# CHECK: ce %f0, 0(%r1)
+0x79 0x00 0x10 0x00
+
+# CHECK: ce %f0, 0(%r15)
+0x79 0x00 0xf0 0x00
+
+# CHECK: ce %f0, 4095(%r1,%r15)
+0x79 0x01 0xff 0xff
+
+# CHECK: ce %f0, 4095(%r15,%r1)
+0x79 0x0f 0x1f 0xff
+
+# CHECK: ce %f15, 0
+0x79 0xf0 0x00 0x00
+
 # CHECK: ceb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x09
 
@@ -1606,6 +2056,18 @@
 # CHECK: cebr %f15, %f0
 0xb3 0x09 0x00 0xf0
 
+# CHECK: cedtr %f0, %f0
+0xb3 0xf4 0x00 0x00
+
+# CHECK: cedtr %f0, %f15
+0xb3 0xf4 0x00 0x0f
+
+# CHECK: cedtr %f7, %f8
+0xb3 0xf4 0x00 0x78
+
+# CHECK: cedtr %f15, %f0
+0xb3 0xf4 0x00 0xf0
+
 # CHECK: cefbr %f0, %r0
 0xb3 0x94 0x00 0x00
 
@@ -1639,6 +2101,21 @@
 # CHECK: cefbra %f15, 0, %r0, 1
 0xb3 0x94 0x01 0xf0
 
+# CHECK: cefr %f0, %r0
+0xb3 0xb4 0x00 0x00
+
+# CHECK: cefr %f0, %r15
+0xb3 0xb4 0x00 0x0f
+
+# CHECK: cefr %f15, %r0
+0xb3 0xb4 0x00 0xf0
+
+# CHECK: cefr %f7, %r8
+0xb3 0xb4 0x00 0x78
+
+# CHECK: cefr %f15, %r15
+0xb3 0xb4 0x00 0xff
+
 # CHECK: cegbr %f0, %r0
 0xb3 0xa4 0x00 0x00
 
@@ -1672,6 +2149,21 @@
 # CHECK: cegbra %f15, 0, %r0, 1
 0xb3 0xa4 0x01 0xf0
 
+# CHECK: cegr %f0, %r0
+0xb3 0xc4 0x00 0x00
+
+# CHECK: cegr %f0, %r15
+0xb3 0xc4 0x00 0x0f
+
+# CHECK: cegr %f15, %r0
+0xb3 0xc4 0x00 0xf0
+
+# CHECK: cegr %f7, %r8
+0xb3 0xc4 0x00 0x78
+
+# CHECK: cegr %f15, %r15
+0xb3 0xc4 0x00 0xff
+
 # CHECK: celfbr %f0, 0, %r0, 1
 0xb3 0x90 0x01 0x00
 
@@ -1708,6 +2200,30 @@
 # CHECK: celgbr %f15, 0, %r0, 1
 0xb3 0xa0 0x01 0xf0
 
+# CHECK: cer %f0, %f0
+0x39 0x00
+
+# CHECK: cer %f0, %f15
+0x39 0x0f
+
+# CHECK: cer %f7, %f8
+0x39 0x78
+
+# CHECK: cer %f15, %f0
+0x39 0xf0
+
+# CHECK: cextr %f0, %f0
+0xb3 0xfc 0x00 0x00
+
+# CHECK: cextr %f0, %f13
+0xb3 0xfc 0x00 0x0d
+
+# CHECK: cextr %f8, %f8
+0xb3 0xfc 0x00 0x88
+
+# CHECK: cextr %f13, %f0
+0xb3 0xfc 0x00 0xd0
+
 # CHECK: cfc 0
 0xb2 0x1a 0x00 0x00
 
@@ -1759,6 +2275,39 @@
 # CHECK: cfdbra %r15, 0, %f0, 1
 0xb3 0x99 0x01 0xf0
 
+# CHECK: cfdr %r0, 0, %f0
+0xb3 0xb9 0x00 0x00
+
+# CHECK: cfdr %r0, 0, %f15
+0xb3 0xb9 0x00 0x0f
+
+# CHECK: cfdr %r0, 15, %f0
+0xb3 0xb9 0xf0 0x00
+
+# CHECK: cfdr %r4, 5, %f6
+0xb3 0xb9 0x50 0x46
+
+# CHECK: cfdr %r15, 0, %f0
+0xb3 0xb9 0x00 0xf0
+
+# CHECK: cfdtr %r0, 0, %f0, 0
+0xb9 0x41 0x00 0x00
+
+# CHECK: cfdtr %r0, 0, %f0, 15
+0xb9 0x41 0x0f 0x00
+
+# CHECK: cfdtr %r0, 0, %f15, 0
+0xb9 0x41 0x00 0x0f
+
+# CHECK: cfdtr %r0, 15, %f0, 0
+0xb9 0x41 0xf0 0x00
+
+# CHECK: cfdtr %r4, 5, %f6, 7
+0xb9 0x41 0x57 0x46
+
+# CHECK: cfdtr %r15, 0, %f0, 0
+0xb9 0x41 0x00 0xf0
+
 # CHECK: cfebr %r0, 0, %f0
 0xb3 0x98 0x00 0x00
 
@@ -1792,6 +2341,21 @@
 # CHECK: cfebra %r15, 0, %f0, 1
 0xb3 0x98 0x01 0xf0
 
+# CHECK: cfer %r0, 0, %f0
+0xb3 0xb8 0x00 0x00
+
+# CHECK: cfer %r0, 0, %f15
+0xb3 0xb8 0x00 0x0f
+
+# CHECK: cfer %r0, 15, %f0
+0xb3 0xb8 0xf0 0x00
+
+# CHECK: cfer %r4, 5, %f6
+0xb3 0xb8 0x50 0x46
+
+# CHECK: cfer %r15, 0, %f0
+0xb3 0xb8 0x00 0xf0
+
 # CHECK: cfi %r0, -2147483648
 0xc2 0x0d 0x80 0x00 0x00 0x00
 
@@ -1843,6 +2407,39 @@
 # CHECK: cfxbra %r15, 0, %f0, 1
 0xb3 0x9a 0x01 0xf0
 
+# CHECK: cfxr %r0, 0, %f0
+0xb3 0xba 0x00 0x00
+
+# CHECK: cfxr %r0, 0, %f13
+0xb3 0xba 0x00 0x0d
+
+# CHECK: cfxr %r0, 15, %f0
+0xb3 0xba 0xf0 0x00
+
+# CHECK: cfxr %r4, 5, %f8
+0xb3 0xba 0x50 0x48
+
+# CHECK: cfxr %r15, 0, %f0
+0xb3 0xba 0x00 0xf0
+
+# CHECK: cfxtr %r0, 0, %f0, 0
+0xb9 0x49 0x00 0x00
+
+# CHECK: cfxtr %r0, 0, %f0, 15
+0xb9 0x49 0x0f 0x00
+
+# CHECK: cfxtr %r0, 0, %f13, 0
+0xb9 0x49 0x00 0x0d
+
+# CHECK: cfxtr %r0, 15, %f0, 0
+0xb9 0x49 0xf0 0x00
+
+# CHECK: cfxtr %r7, 5, %f8, 9
+0xb9 0x49 0x59 0x78
+
+# CHECK: cfxtr %r15, 0, %f0, 0
+0xb9 0x49 0x00 0xf0
+
 # CHECK: cg %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x20
 
@@ -1906,6 +2503,54 @@
 # CHECK: cgdbra %r15, 0, %f0, 1
 0xb3 0xa9 0x01 0xf0
 
+# CHECK: cgdr %r0, 0, %f0
+0xb3 0xc9 0x00 0x00
+
+# CHECK: cgdr %r0, 0, %f15
+0xb3 0xc9 0x00 0x0f
+
+# CHECK: cgdr %r0, 15, %f0
+0xb3 0xc9 0xf0 0x00
+
+# CHECK: cgdr %r4, 5, %f6
+0xb3 0xc9 0x50 0x46
+
+# CHECK: cgdr %r15, 0, %f0
+0xb3 0xc9 0x00 0xf0
+
+# CHECK: cgdtr %r0, 0, %f0
+0xb3 0xe1 0x00 0x00
+
+# CHECK: cgdtr %r0, 0, %f15
+0xb3 0xe1 0x00 0x0f
+
+# CHECK: cgdtr %r0, 15, %f0
+0xb3 0xe1 0xf0 0x00
+
+# CHECK: cgdtr %r4, 5, %f6
+0xb3 0xe1 0x50 0x46
+
+# CHECK: cgdtr %r15, 0, %f0
+0xb3 0xe1 0x00 0xf0
+
+# CHECK: cgdtra %r0, 0, %f0, 1
+0xb3 0xe1 0x01 0x00
+
+# CHECK: cgdtra %r0, 0, %f0, 15
+0xb3 0xe1 0x0f 0x00
+
+# CHECK: cgdtra %r0, 0, %f15, 1
+0xb3 0xe1 0x01 0x0f
+
+# CHECK: cgdtra %r0, 15, %f0, 1
+0xb3 0xe1 0xf1 0x00
+
+# CHECK: cgdtra %r4, 5, %f6, 7
+0xb3 0xe1 0x57 0x46
+
+# CHECK: cgdtra %r15, 0, %f0, 1
+0xb3 0xe1 0x01 0xf0
+
 # CHECK: cgebr %r0, 0, %f0
 0xb3 0xa8 0x00 0x00
 
@@ -1939,6 +2584,21 @@
 # CHECK: cgebra %r15, 0, %f0, 1
 0xb3 0xa8 0x01 0xf0
 
+# CHECK: cger %r0, 0, %f0
+0xb3 0xc8 0x00 0x00
+
+# CHECK: cger %r0, 0, %f15
+0xb3 0xc8 0x00 0x0f
+
+# CHECK: cger %r0, 15, %f0
+0xb3 0xc8 0xf0 0x00
+
+# CHECK: cger %r4, 5, %f6
+0xb3 0xc8 0x50 0x46
+
+# CHECK: cger %r15, 0, %f0
+0xb3 0xc8 0x00 0xf0
+
 # CHECK: cgf %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x30
 
@@ -2299,6 +2959,54 @@
 # CHECK: cgxbra %r15, 0, %f0, 1
 0xb3 0xaa 0x01 0xf0
 
+# CHECK: cgxr %r0, 0, %f0
+0xb3 0xca 0x00 0x00
+
+# CHECK: cgxr %r0, 0, %f13
+0xb3 0xca 0x00 0x0d
+
+# CHECK: cgxr %r0, 15, %f0
+0xb3 0xca 0xf0 0x00
+
+# CHECK: cgxr %r4, 5, %f8
+0xb3 0xca 0x50 0x48
+
+# CHECK: cgxr %r15, 0, %f0
+0xb3 0xca 0x00 0xf0
+
+# CHECK: cgxtr %r0, 0, %f0
+0xb3 0xe9 0x00 0x00
+
+# CHECK: cgxtr %r0, 0, %f13
+0xb3 0xe9 0x00 0x0d
+
+# CHECK: cgxtr %r0, 15, %f0
+0xb3 0xe9 0xf0 0x00
+
+# CHECK: cgxtr %r4, 5, %f8
+0xb3 0xe9 0x50 0x48
+
+# CHECK: cgxtr %r15, 0, %f0
+0xb3 0xe9 0x00 0xf0
+
+# CHECK: cgxtra %r0, 0, %f0, 1
+0xb3 0xe9 0x01 0x00
+
+# CHECK: cgxtra %r0, 0, %f0, 15
+0xb3 0xe9 0x0f 0x00
+
+# CHECK: cgxtra %r0, 0, %f13, 1
+0xb3 0xe9 0x01 0x0d
+
+# CHECK: cgxtra %r0, 15, %f0, 1
+0xb3 0xe9 0xf1 0x00
+
+# CHECK: cgxtra %r7, 5, %f8, 9
+0xb3 0xe9 0x59 0x78
+
+# CHECK: cgxtra %r15, 0, %f0, 1
+0xb3 0xe9 0x01 0xf0
+
 # CHECK: ch %r0, 0
 0x49 0x00 0x00 0x00
 
@@ -2722,6 +3430,24 @@
 # CHECK: clfdbr %r15, 0, %f0, 1
 0xb3 0x9d 0x01 0xf0
 
+# CHECK: clfdtr %r0, 0, %f0, 0
+0xb9 0x43 0x00 0x00
+
+# CHECK: clfdtr %r0, 0, %f0, 15
+0xb9 0x43 0x0f 0x00
+
+# CHECK: clfdtr %r0, 0, %f15, 0
+0xb9 0x43 0x00 0x0f
+
+# CHECK: clfdtr %r0, 15, %f0, 0
+0xb9 0x43 0xf0 0x00
+
+# CHECK: clfdtr %r4, 5, %f6, 7
+0xb9 0x43 0x57 0x46
+
+# CHECK: clfdtr %r15, 0, %f0, 0
+0xb9 0x43 0x00 0xf0
+
 # CHECK: clfebr %r0, 0, %f0, 1
 0xb3 0x9c 0x01 0x00
 
@@ -2758,6 +3484,24 @@
 # CHECK: clfxbr %r15, 0, %f0, 1
 0xb3 0x9e 0x01 0xf0
 
+# CHECK: clfxtr %r0, 0, %f0, 0
+0xb9 0x4b 0x00 0x00
+
+# CHECK: clfxtr %r0, 0, %f0, 15
+0xb9 0x4b 0x0f 0x00
+
+# CHECK: clfxtr %r0, 0, %f13, 0
+0xb9 0x4b 0x00 0x0d
+
+# CHECK: clfxtr %r0, 15, %f0, 0
+0xb9 0x4b 0xf0 0x00
+
+# CHECK: clfxtr %r7, 5, %f8, 9
+0xb9 0x4b 0x59 0x78
+
+# CHECK: clfxtr %r15, 0, %f0, 0
+0xb9 0x4b 0x00 0xf0
+
 # CHECK: clgdbr %r0, 0, %f0, 1
 0xb3 0xad 0x01 0x00
 
@@ -2776,6 +3520,24 @@
 # CHECK: clgdbr %r15, 0, %f0, 1
 0xb3 0xad 0x01 0xf0
 
+# CHECK: clgdtr %r0, 0, %f0, 0
+0xb9 0x42 0x00 0x00
+
+# CHECK: clgdtr %r0, 0, %f0, 15
+0xb9 0x42 0x0f 0x00
+
+# CHECK: clgdtr %r0, 0, %f15, 0
+0xb9 0x42 0x00 0x0f
+
+# CHECK: clgdtr %r0, 15, %f0, 0
+0xb9 0x42 0xf0 0x00
+
+# CHECK: clgdtr %r4, 5, %f6, 7
+0xb9 0x42 0x57 0x46
+
+# CHECK: clgdtr %r15, 0, %f0, 0
+0xb9 0x42 0x00 0xf0
+
 # CHECK: clgebr %r0, 0, %f0, 1
 0xb3 0xac 0x01 0x00
 
@@ -2884,6 +3646,24 @@
 # CHECK: clgxbr %r15, 0, %f0, 1
 0xb3 0xae 0x01 0xf0
 
+# CHECK: clgxtr %r0, 0, %f0, 0
+0xb9 0x4a 0x00 0x00
+
+# CHECK: clgxtr %r0, 0, %f0, 15
+0xb9 0x4a 0x0f 0x00
+
+# CHECK: clgxtr %r0, 0, %f13, 0
+0xb9 0x4a 0x00 0x0d
+
+# CHECK: clgxtr %r0, 15, %f0, 0
+0xb9 0x4a 0xf0 0x00
+
+# CHECK: clgxtr %r7, 5, %f8, 9
+0xb9 0x4a 0x59 0x78
+
+# CHECK: clgxtr %r15, 0, %f0, 0
+0xb9 0x4a 0x00 0xf0
+
 # CHECK: clfhsi 0, 0
 0xe5 0x5d 0x00 0x00 0x00 0x00
 
@@ -3844,6 +4624,21 @@
 # CHECK: cs %r15, %r0, 0
 0xba 0xf0 0x00 0x00
 
+# CHECK: csdtr %r0, %f0, 0
+0xb3 0xe3 0x00 0x00
+
+# CHECK: csdtr %r0, %f15, 0
+0xb3 0xe3 0x00 0x0f
+
+# CHECK: csdtr %r0, %f0, 15
+0xb3 0xe3 0x0f 0x00
+
+# CHECK: csdtr %r4, %f5, 6
+0xb3 0xe3 0x06 0x45
+
+# CHECK: csdtr %r15, %f0, 0
+0xb3 0xe3 0x00 0xf0
+
 # CHECK: csg %r0, %r0, -524288
 0xeb 0x00 0x00 0x00 0x80 0x30
 
@@ -3898,6 +4693,21 @@
 # CHECK: csst 4095(%r1), 0(%r15), %r2
 0xc8 0x22 0x1f 0xff 0xf0 0x00
 
+# CHECK: csxtr %r0, %f0, 0
+0xb3 0xeb 0x00 0x00
+
+# CHECK: csxtr %r0, %f13, 0
+0xb3 0xeb 0x00 0x0d
+
+# CHECK: csxtr %r0, %f0, 15
+0xb3 0xeb 0x0f 0x00
+
+# CHECK: csxtr %r4, %f5, 6
+0xb3 0xeb 0x06 0x45
+
+# CHECK: csxtr %r14, %f0, 0
+0xb3 0xeb 0x00 0xe0
+
 # CHECK: csy %r0, %r0, -524288
 0xeb 0x00 0x00 0x00 0x80 0x14
 
@@ -4027,6 +4837,21 @@
 # CHECK: cu42 %r6, %r8
 0xb9 0xb3 0x00 0x68
 
+# CHECK: cudtr %r0, %f0
+0xb3 0xe2 0x00 0x00
+
+# CHECK: cudtr %r0, %f15
+0xb3 0xe2 0x00 0x0f
+
+# CHECK: cudtr %r15, %f0
+0xb3 0xe2 0x00 0xf0
+
+# CHECK: cudtr %r7, %f8
+0xb3 0xe2 0x00 0x78
+
+# CHECK: cudtr %r15, %f15
+0xb3 0xe2 0x00 0xff
+
 # CHECK: cuse %r0, %r0
 0xb2 0x57 0x00 0x00
 
@@ -4039,6 +4864,21 @@
 # CHECK: cuse %r6, %r8
 0xb2 0x57 0x00 0x68
 
+# CHECK: cuxtr %r0, %f0
+0xb3 0xea 0x00 0x00
+
+# CHECK: cuxtr %r0, %f13
+0xb3 0xea 0x00 0x0d
+
+# CHECK: cuxtr %r14, %f0
+0xb3 0xea 0x00 0xe0
+
+# CHECK: cuxtr %r6, %f8
+0xb3 0xea 0x00 0x68
+
+# CHECK: cuxtr %r14, %f13
+0xb3 0xea 0x00 0xed
+
 # CHECK: cvb %r0, 0
 0x4f 0x00 0x00 0x00
 
@@ -4246,6 +5086,39 @@
 # CHECK: cxfbra %f13, 0, %r0, 1
 0xb3 0x96 0x01 0xd0
 
+# CHECK: cxfr %f0, %r0
+0xb3 0xb6 0x00 0x00
+
+# CHECK: cxfr %f0, %r15
+0xb3 0xb6 0x00 0x0f
+
+# CHECK: cxfr %f13, %r0
+0xb3 0xb6 0x00 0xd0
+
+# CHECK: cxfr %f8, %r7
+0xb3 0xb6 0x00 0x87
+
+# CHECK: cxfr %f13, %r15
+0xb3 0xb6 0x00 0xdf
+
+# CHECK: cxftr %f0, 0, %r0, 0
+0xb9 0x59 0x00 0x00
+
+# CHECK: cxftr %f0, 0, %r0, 15
+0xb9 0x59 0x0f 0x00
+
+# CHECK: cxftr %f0, 0, %r15, 0
+0xb9 0x59 0x00 0x0f
+
+# CHECK: cxftr %f0, 15, %r0, 0
+0xb9 0x59 0xf0 0x00
+
+# CHECK: cxftr %f4, 5, %r9, 10
+0xb9 0x59 0x5a 0x49
+
+# CHECK: cxftr %f13, 0, %r0, 0
+0xb9 0x59 0x00 0xd0
+
 # CHECK: cxgbr %f0, %r0
 0xb3 0xa6 0x00 0x00
 
@@ -4279,6 +5152,54 @@
 # CHECK: cxgbra %f13, 0, %r0, 1
 0xb3 0xa6 0x01 0xd0
 
+# CHECK: cxgr %f0, %r0
+0xb3 0xc6 0x00 0x00
+
+# CHECK: cxgr %f0, %r15
+0xb3 0xc6 0x00 0x0f
+
+# CHECK: cxgr %f13, %r0
+0xb3 0xc6 0x00 0xd0
+
+# CHECK: cxgr %f8, %r7
+0xb3 0xc6 0x00 0x87
+
+# CHECK: cxgr %f13, %r15
+0xb3 0xc6 0x00 0xdf
+
+# CHECK: cxgtr %f0, %r0
+0xb3 0xf9 0x00 0x00
+
+# CHECK: cxgtr %f0, %r15
+0xb3 0xf9 0x00 0x0f
+
+# CHECK: cxgtr %f13, %r0
+0xb3 0xf9 0x00 0xd0
+
+# CHECK: cxgtr %f8, %r7
+0xb3 0xf9 0x00 0x87
+
+# CHECK: cxgtr %f13, %r15
+0xb3 0xf9 0x00 0xdf
+
+# CHECK: cxgtra %f0, 0, %r0, 1
+0xb3 0xf9 0x01 0x00
+
+# CHECK: cxgtra %f0, 0, %r0, 15
+0xb3 0xf9 0x0f 0x00
+
+# CHECK: cxgtra %f0, 0, %r15, 1
+0xb3 0xf9 0x01 0x0f
+
+# CHECK: cxgtra %f0, 15, %r0, 1
+0xb3 0xf9 0xf1 0x00
+
+# CHECK: cxgtra %f4, 5, %r9, 10
+0xb3 0xf9 0x5a 0x49
+
+# CHECK: cxgtra %f13, 0, %r0, 1
+0xb3 0xf9 0x01 0xd0
+
 # CHECK: cxlfbr %f0, 0, %r0, 1
 0xb3 0x92 0x01 0x00
 
@@ -4297,6 +5218,24 @@
 # CHECK: cxlfbr %f13, 0, %r0, 1
 0xb3 0x92 0x01 0xd0
 
+# CHECK: cxlftr %f0, 0, %r0, 0
+0xb9 0x5b 0x00 0x00
+
+# CHECK: cxlftr %f0, 0, %r0, 15
+0xb9 0x5b 0x0f 0x00
+
+# CHECK: cxlftr %f0, 0, %r15, 0
+0xb9 0x5b 0x00 0x0f
+
+# CHECK: cxlftr %f0, 15, %r0, 0
+0xb9 0x5b 0xf0 0x00
+
+# CHECK: cxlftr %f4, 5, %r9, 10
+0xb9 0x5b 0x5a 0x49
+
+# CHECK: cxlftr %f13, 0, %r0, 0
+0xb9 0x5b 0x00 0xd0
+
 # CHECK: cxlgbr %f0, 0, %r0, 1
 0xb3 0xa2 0x01 0x00
 
@@ -4315,6 +5254,105 @@
 # CHECK: cxlgbr %f13, 0, %r0, 1
 0xb3 0xa2 0x01 0xd0
 
+# CHECK: cxlgtr %f0, 0, %r0, 0
+0xb9 0x5a 0x00 0x00
+
+# CHECK: cxlgtr %f0, 0, %r0, 15
+0xb9 0x5a 0x0f 0x00
+
+# CHECK: cxlgtr %f0, 0, %r15, 0
+0xb9 0x5a 0x00 0x0f
+
+# CHECK: cxlgtr %f0, 15, %r0, 0
+0xb9 0x5a 0xf0 0x00
+
+# CHECK: cxlgtr %f4, 5, %r9, 10
+0xb9 0x5a 0x5a 0x49
+
+# CHECK: cxlgtr %f13, 0, %r0, 0
+0xb9 0x5a 0x00 0xd0
+
+# CHECK: cxr %f0, %f0
+0xb3 0x69 0x00 0x00
+
+# CHECK: cxr %f0, %f13
+0xb3 0x69 0x00 0x0d
+
+# CHECK: cxr %f8, %f8
+0xb3 0x69 0x00 0x88
+
+# CHECK: cxr %f13, %f0
+0xb3 0x69 0x00 0xd0
+
+# CHECK: cxstr %f0, %r0
+0xb3 0xfb 0x00 0x00
+
+# CHECK: cxstr %f0, %r14
+0xb3 0xfb 0x00 0x0e
+
+# CHECK: cxstr %f13, %r0
+0xb3 0xfb 0x00 0xd0
+
+# CHECK: cxstr %f8, %r6
+0xb3 0xfb 0x00 0x86
+
+# CHECK: cxstr %f13, %r14
+0xb3 0xfb 0x00 0xde
+
+# CHECK: cxtr %f0, %f0
+0xb3 0xec 0x00 0x00
+
+# CHECK: cxtr %f0, %f13
+0xb3 0xec 0x00 0x0d
+
+# CHECK: cxtr %f8, %f8
+0xb3 0xec 0x00 0x88
+
+# CHECK: cxtr %f13, %f0
+0xb3 0xec 0x00 0xd0
+
+# CHECK: cxutr %f0, %r0
+0xb3 0xfa 0x00 0x00
+
+# CHECK: cxutr %f0, %r14
+0xb3 0xfa 0x00 0x0e
+
+# CHECK: cxutr %f13, %r0
+0xb3 0xfa 0x00 0xd0
+
+# CHECK: cxutr %f8, %r6
+0xb3 0xfa 0x00 0x86
+
+# CHECK: cxutr %f13, %r14
+0xb3 0xfa 0x00 0xde
+
+# CHECK: cxzt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xab
+
+# CHECK: cxzt %f13, 0(1), 0
+0xed 0x00 0x00 0x00 0xd0 0xab
+
+# CHECK: cxzt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xab
+
+# CHECK: cxzt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xab
+
+# CHECK: cxzt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xab
+
+# CHECK: cxzt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xab
+
+# CHECK: cxzt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xab
+
+# CHECK: cxzt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xab
+
+# CHECK: cxzt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xab
+
 # CHECK: cy %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x59
 
@@ -4345,6 +5383,60 @@
 # CHECK: cy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x59
 
+# CHECK: czdt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xa8
+
+# CHECK: czdt %f15, 0(1), 0
+0xed 0x00 0x00 0x00 0xf0 0xa8
+
+# CHECK: czdt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xa8
+
+# CHECK: czdt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xa8
+
+# CHECK: czdt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xa8
+
+# CHECK: czdt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xa8
+
+# CHECK: czdt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xa8
+
+# CHECK: czdt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xa8
+
+# CHECK: czdt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xa8
+
+# CHECK: czxt %f0, 0(1), 0
+0xed 0x00 0x00 0x00 0x00 0xa9
+
+# CHECK: czxt %f13, 0(1), 0
+0xed 0x00 0x00 0x00 0xd0 0xa9
+
+# CHECK: czxt %f0, 0(1), 15
+0xed 0x00 0x00 0x00 0x0f 0xa9
+
+# CHECK: czxt %f0, 0(1,%r1), 0
+0xed 0x00 0x10 0x00 0x00 0xa9
+
+# CHECK: czxt %f0, 0(1,%r15), 0
+0xed 0x00 0xf0 0x00 0x00 0xa9
+
+# CHECK: czxt %f0, 4095(1,%r1), 0
+0xed 0x00 0x1f 0xff 0x00 0xa9
+
+# CHECK: czxt %f0, 4095(1,%r15), 0
+0xed 0x00 0xff 0xff 0x00 0xa9
+
+# CHECK: czxt %f0, 0(256,%r1), 0
+0xed 0xff 0x10 0x00 0x00 0xa9
+
+# CHECK: czxt %f0, 0(256,%r15), 0
+0xed 0xff 0xf0 0x00 0x00 0xa9
+
 # CHECK: d %r0, 0
 0x5d 0x00 0x00 0x00
 
@@ -4366,6 +5458,27 @@
 # CHECK: d %r14, 0
 0x5d 0xe0 0x00 0x00
 
+# CHECK: dd %f0, 0
+0x6d 0x00 0x00 0x00
+
+# CHECK: dd %f0, 4095
+0x6d 0x00 0x0f 0xff
+
+# CHECK: dd %f0, 0(%r1)
+0x6d 0x00 0x10 0x00
+
+# CHECK: dd %f0, 0(%r15)
+0x6d 0x00 0xf0 0x00
+
+# CHECK: dd %f0, 4095(%r1,%r15)
+0x6d 0x01 0xff 0xff
+
+# CHECK: dd %f0, 4095(%r15,%r1)
+0x6d 0x0f 0x1f 0xff
+
+# CHECK: dd %f15, 0
+0x6d 0xf0 0x00 0x00
+
 # CHECK: ddb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x1d
 
@@ -4399,6 +5512,72 @@
 # CHECK: ddbr %f15, %f0
 0xb3 0x1d 0x00 0xf0
 
+# CHECK: ddr %f0, %f0
+0x2d 0x00
+
+# CHECK: ddr %f0, %f15
+0x2d 0x0f
+
+# CHECK: ddr %f7, %f8
+0x2d 0x78
+
+# CHECK: ddr %f15, %f0
+0x2d 0xf0
+
+# CHECK: ddtr %f0, %f0, %f0
+0xb3 0xd1 0x00 0x00
+
+# CHECK: ddtr %f0, %f0, %f15
+0xb3 0xd1 0xf0 0x00
+
+# CHECK: ddtr %f0, %f15, %f0
+0xb3 0xd1 0x00 0x0f
+
+# CHECK: ddtr %f15, %f0, %f0
+0xb3 0xd1 0x00 0xf0
+
+# CHECK: ddtr %f7, %f8, %f9
+0xb3 0xd1 0x90 0x78
+
+# CHECK: ddtra %f0, %f0, %f0, 1
+0xb3 0xd1 0x01 0x00
+
+# CHECK: ddtra %f0, %f0, %f0, 15
+0xb3 0xd1 0x0f 0x00
+
+# CHECK: ddtra %f0, %f0, %f15, 1
+0xb3 0xd1 0xf1 0x00
+
+# CHECK: ddtra %f0, %f15, %f0, 1
+0xb3 0xd1 0x01 0x0f
+
+# CHECK: ddtra %f15, %f0, %f0, 1
+0xb3 0xd1 0x01 0xf0
+
+# CHECK: ddtra %f7, %f8, %f9, 10
+0xb3 0xd1 0x9a 0x78
+
+# CHECK: de %f0, 0
+0x7d 0x00 0x00 0x00
+
+# CHECK: de %f0, 4095
+0x7d 0x00 0x0f 0xff
+
+# CHECK: de %f0, 0(%r1)
+0x7d 0x00 0x10 0x00
+
+# CHECK: de %f0, 0(%r15)
+0x7d 0x00 0xf0 0x00
+
+# CHECK: de %f0, 4095(%r1,%r15)
+0x7d 0x01 0xff 0xff
+
+# CHECK: de %f0, 4095(%r15,%r1)
+0x7d 0x0f 0x1f 0xff
+
+# CHECK: de %f15, 0
+0x7d 0xf0 0x00 0x00
+
 # CHECK: deb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x0d
 
@@ -4432,6 +5611,18 @@
 # CHECK: debr %f15, %f0
 0xb3 0x0d 0x00 0xf0
 
+# CHECK: der %f0, %f0
+0x3d 0x00
+
+# CHECK: der %f0, %f15
+0x3d 0x0f
+
+# CHECK: der %f7, %f8
+0x3d 0x78
+
+# CHECK: der %f15, %f0
+0x3d 0xf0
+
 # CHECK: didbr	%f0, %f0, %f0, 1
 0xb3 0x5b 0x01 0x00
 
@@ -4702,6 +5893,51 @@
 # CHECK: dxbr %f13, %f0
 0xb3 0x4d 0x00 0xd0
 
+# CHECK: dxr %f0, %f0
+0xb2 0x2d 0x00 0x00
+
+# CHECK: dxr %f0, %f13
+0xb2 0x2d 0x00 0x0d
+
+# CHECK: dxr %f8, %f8
+0xb2 0x2d 0x00 0x88
+
+# CHECK: dxr %f13, %f0
+0xb2 0x2d 0x00 0xd0
+
+# CHECK: dxtr %f0, %f0, %f0
+0xb3 0xd9 0x00 0x00
+
+# CHECK: dxtr %f0, %f0, %f13
+0xb3 0xd9 0xd0 0x00
+
+# CHECK: dxtr %f0, %f13, %f0
+0xb3 0xd9 0x00 0x0d
+
+# CHECK: dxtr %f13, %f0, %f0
+0xb3 0xd9 0x00 0xd0
+
+# CHECK: dxtr %f8, %f8, %f8
+0xb3 0xd9 0x80 0x88
+
+# CHECK: dxtra %f0, %f0, %f0, 1
+0xb3 0xd9 0x01 0x00
+
+# CHECK: dxtra %f0, %f0, %f0, 15
+0xb3 0xd9 0x0f 0x00
+
+# CHECK: dxtra %f0, %f0, %f13, 1
+0xb3 0xd9 0xd1 0x00
+
+# CHECK: dxtra %f0, %f13, %f0, 1
+0xb3 0xd9 0x01 0x0d
+
+# CHECK: dxtra %f13, %f0, %f0, 1
+0xb3 0xd9 0x01 0xd0
+
+# CHECK: dxtra %f8, %f8, %f8, 8
+0xb3 0xd9 0x88 0x88
+
 # CHECK: ear %r0, %a0
 0xb2 0x4f 0x00 0x00
 
@@ -4849,6 +6085,30 @@
 # CHECK: edmk 0(256,%r15), 0
 0xdf 0xff 0xf0 0x00 0x00 0x00
 
+# CHECK: eedtr %f0, %f9
+0xb3 0xe5 0x00 0x09
+
+# CHECK: eedtr %f0, %f15
+0xb3 0xe5 0x00 0x0f
+
+# CHECK: eedtr %f15, %f0
+0xb3 0xe5 0x00 0xf0
+
+# CHECK: eedtr %f15, %f9
+0xb3 0xe5 0x00 0xf9
+
+# CHECK: eextr %f0, %f8
+0xb3 0xed 0x00 0x08
+
+# CHECK: eextr %f0, %f13
+0xb3 0xed 0x00 0x0d
+
+# CHECK: eextr %f13, %f0
+0xb3 0xed 0x00 0xd0
+
+# CHECK: eextr %f13, %f9
+0xb3 0xed 0x00 0xd9
+
 # CHECK: efpc %r0
 0xb3 0x8c 0x00 0x00
 
@@ -4870,6 +6130,30 @@
 # CHECK: epsw %r6, %r8
 0xb9 0x8d 0x00 0x68
 
+# CHECK: esdtr %f0, %f9
+0xb3 0xe7 0x00 0x09
+
+# CHECK: esdtr %f0, %f15
+0xb3 0xe7 0x00 0x0f
+
+# CHECK: esdtr %f15, %f0
+0xb3 0xe7 0x00 0xf0
+
+# CHECK: esdtr %f15, %f9
+0xb3 0xe7 0x00 0xf9
+
+# CHECK: esxtr %f0, %f8
+0xb3 0xef 0x00 0x08
+
+# CHECK: esxtr %f0, %f13
+0xb3 0xef 0x00 0x0d
+
+# CHECK: esxtr %f13, %f0
+0xb3 0xef 0x00 0xd0
+
+# CHECK: esxtr %f13, %f9
+0xb3 0xef 0x00 0xd9
+
 # CHECK: etnd %r0
 0xb2 0xec 0x00 0x00
 
@@ -4933,6 +6217,36 @@
 # CHECK: fidbra %f15, 0, %f0, 1
 0xb3 0x5f 0x01 0xf0
 
+# CHECK: fidr %f0, %f0
+0xb3 0x7f 0x00 0x00
+
+# CHECK: fidr %f0, %f15
+0xb3 0x7f 0x00 0x0f
+
+# CHECK: fidr %f4, %f6
+0xb3 0x7f 0x00 0x46
+
+# CHECK: fidr %f15, %f0
+0xb3 0x7f 0x00 0xf0
+
+# CHECK: fidtr %f0, 0, %f0, 0
+0xb3 0xd7 0x00 0x00
+
+# CHECK: fidtr %f0, 0, %f0, 15
+0xb3 0xd7 0x0f 0x00
+
+# CHECK: fidtr %f0, 0, %f15, 0
+0xb3 0xd7 0x00 0x0f
+
+# CHECK: fidtr %f0, 15, %f0, 0
+0xb3 0xd7 0xf0 0x00
+
+# CHECK: fidtr %f4, 5, %f6, 7
+0xb3 0xd7 0x57 0x46
+
+# CHECK: fidtr %f15, 0, %f0, 0
+0xb3 0xd7 0x00 0xf0
+
 # CHECK: fiebr %f0, 0, %f0
 0xb3 0x57 0x00 0x00
 
@@ -4966,6 +6280,18 @@
 # CHECK: fiebra %f15, 0, %f0, 1
 0xb3 0x57 0x01 0xf0
 
+# CHECK: fier %f0, %f0
+0xb3 0x77 0x00 0x00
+
+# CHECK: fier %f0, %f15
+0xb3 0x77 0x00 0x0f
+
+# CHECK: fier %f4, %f6
+0xb3 0x77 0x00 0x46
+
+# CHECK: fier %f15, %f0
+0xb3 0x77 0x00 0xf0
+
 # CHECK: fixbr %f0, 0, %f0
 0xb3 0x47 0x00 0x00
 
@@ -4999,6 +6325,36 @@
 # CHECK: fixbra %f13, 0, %f0, 1
 0xb3 0x47 0x01 0xd0
 
+# CHECK: fixr %f0, %f0
+0xb3 0x67 0x00 0x00
+
+# CHECK: fixr %f0, %f13
+0xb3 0x67 0x00 0x0d
+
+# CHECK: fixr %f4, %f8
+0xb3 0x67 0x00 0x48
+
+# CHECK: fixr %f13, %f0
+0xb3 0x67 0x00 0xd0
+
+# CHECK: fixtr %f0, 0, %f0, 0
+0xb3 0xdf 0x00 0x00
+
+# CHECK: fixtr %f0, 0, %f0, 15
+0xb3 0xdf 0x0f 0x00
+
+# CHECK: fixtr %f0, 0, %f13, 0
+0xb3 0xdf 0x00 0x0d
+
+# CHECK: fixtr %f0, 15, %f0, 0
+0xb3 0xdf 0xf0 0x00
+
+# CHECK: fixtr %f4, 5, %f8, 9
+0xb3 0xdf 0x59 0x48
+
+# CHECK: fixtr %f13, 0, %f0, 0
+0xb3 0xdf 0x00 0xd0
+
 # CHECK: flogr %r0, %r0
 0xb9 0x83 0x00 0x00
 
@@ -5011,6 +6367,30 @@
 # CHECK: flogr %r14, %r0
 0xb9 0x83 0x00 0xe0
 
+# CHECK: hdr %f0, %f0
+0x24 0x00
+
+# CHECK: hdr %f0, %f15
+0x24 0x0f
+
+# CHECK: hdr %f7, %f8
+0x24 0x78
+
+# CHECK: hdr %f15, %f0
+0x24 0xf0
+
+# CHECK: her %f0, %f0
+0x34 0x00
+
+# CHECK: her %f0, %f15
+0x34 0x0f
+
+# CHECK: her %f7, %f8
+0x34 0x78
+
+# CHECK: her %f15, %f0
+0x34 0xf0
+
 # CHECK: ic %r0, 0
 0x43 0x00 0x00 0x00
 
@@ -5143,6 +6523,42 @@
 # CHECK: icy %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x73
 
+# CHECK: iedtr %f0, %f0, %f0
+0xb3 0xf6 0x00 0x00
+
+# CHECK: iedtr %f0, %f0, %f15
+0xb3 0xf6 0x00 0x0f
+
+# CHECK: iedtr %f0, %f15, %f0
+0xb3 0xf6 0xf0 0x00
+
+# CHECK: iedtr %f15, %f0, %f0
+0xb3 0xf6 0x00 0xf0
+
+# CHECK: iedtr %f1, %f2, %f3
+0xb3 0xf6 0x20 0x13
+
+# CHECK: iedtr %f15, %f15, %f15
+0xb3 0xf6 0xf0 0xff
+
+# CHECK: iextr %f0, %f0, %f0
+0xb3 0xfe 0x00 0x00
+
+# CHECK: iextr %f0, %f0, %f13
+0xb3 0xfe 0x00 0x0d
+
+# CHECK: iextr %f0, %f13, %f0
+0xb3 0xfe 0xd0 0x00
+
+# CHECK: iextr %f13, %f0, %f0
+0xb3 0xfe 0x00 0xd0
+
+# CHECK: iextr %f1, %f8, %f4
+0xb3 0xfe 0x80 0x14
+
+# CHECK: iextr %f13, %f13, %f13
+0xb3 0xfe 0xd0 0xdd
+
 # CHECK: iihf %r0, 0
 0xc0 0x08 0x00 0x00 0x00 0x00
 
@@ -5251,6 +6667,18 @@
 # CHECK: kdbr %f15, %f0
 0xb3 0x18 0x00 0xf0
 
+# CHECK: kdtr %f0, %f0
+0xb3 0xe0 0x00 0x00
+
+# CHECK: kdtr %f0, %f15
+0xb3 0xe0 0x00 0x0f
+
+# CHECK: kdtr %f7, %f8
+0xb3 0xe0 0x00 0x78
+
+# CHECK: kdtr %f15, %f0
+0xb3 0xe0 0x00 0xf0
+
 # CHECK: keb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x08
 
@@ -5392,6 +6820,18 @@
 # CHECK: kxbr %f13, %f0
 0xb3 0x48 0x00 0xd0
 
+# CHECK: kxtr %f0, %f0
+0xb3 0xe8 0x00 0x00
+
+# CHECK: kxtr %f0, %f13
+0xb3 0xe8 0x00 0x0d
+
+# CHECK: kxtr %f8, %f8
+0xb3 0xe8 0x00 0x88
+
+# CHECK: kxtr %f13, %f0
+0xb3 0xe8 0x00 0xd0
+
 # CHECK: l %r0, 0
 0x58 0x00 0x00 0x00
 
@@ -6025,6 +7465,18 @@
 # CHECK: lcdbr %f15, %f9
 0xb3 0x13 0x00 0xf9
 
+# CHECK: lcdr %f0, %f9
+0x23 0x09
+
+# CHECK: lcdr %f0, %f15
+0x23 0x0f
+
+# CHECK: lcdr %f15, %f0
+0x23 0xf0
+
+# CHECK: lcdr %f15, %f9
+0x23 0xf9
+
 # CHECK: lcebr %f0, %f9
 0xb3 0x03 0x00 0x09
 
@@ -6037,9 +7489,20 @@
 # CHECK: lcebr %f15, %f9
 0xb3 0x03 0x00 0xf9
 
+# CHECK: lcer %f0, %f9
+0x33 0x09
+
+# CHECK: lcer %f0, %f15
+0x33 0x0f
+
+# CHECK: lcer %f15, %f0
+0x33 0xf0
+
+# CHECK: lcer %f15, %f9
+0x33 0xf9
+
 # CHECK: lcgfr %r0, %r0
 0xb9 0x13 0x00 0x00
-
 # CHECK: lcgfr %r0, %r15
 0xb9 0x13 0x00 0x0f
 
@@ -6085,6 +7548,18 @@
 # CHECK: lcxbr %f13, %f9
 0xb3 0x43 0x00 0xd9
 
+# CHECK: lcxr %f0, %f8
+0xb3 0x63 0x00 0x08
+
+# CHECK: lcxr %f0, %f13
+0xb3 0x63 0x00 0x0d
+
+# CHECK: lcxr %f13, %f0
+0xb3 0x63 0x00 0xd0
+
+# CHECK: lcxr %f13, %f9
+0xb3 0x63 0x00 0xd9
+
 # CHECK: ld %f0, 0
 0x68 0x00 0x00 0x00
 
@@ -6106,6 +7581,27 @@
 # CHECK: ld %f15, 0
 0x68 0xf0 0x00 0x00
 
+# CHECK: lde %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x24
+
+# CHECK: lde %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x24
+
+# CHECK: lde %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x24
+
+# CHECK: lde %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x24
+
+# CHECK: lde %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x24
+
+# CHECK: lde %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x24
+
+# CHECK: lde %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x24
+
 # CHECK: ldeb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x04
 
@@ -6136,6 +7632,27 @@
 # CHECK: ldebr %f15, %f0
 0xb3 0x04 0x00 0xf0
 
+# CHECK: lder %f0, %f15
+0xb3 0x24 0x00 0x0f
+
+# CHECK: lder %f7, %f8
+0xb3 0x24 0x00 0x78
+
+# CHECK: lder %f15, %f0
+0xb3 0x24 0x00 0xf0
+
+# CHECK: ldetr %f0, %f0, 15
+0xb3 0xd4 0x0f 0x00
+
+# CHECK: ldetr %f0, %f15, 0
+0xb3 0xd4 0x00 0x0f
+
+# CHECK: ldetr %f7, %f8, 9
+0xb3 0xd4 0x09 0x78
+
+# CHECK: ldetr %f15, %f0, 0
+0xb3 0xd4 0x00 0xf0
+
 # CHECK: ldgr %f0, %r0
 0xb3 0xc1 0x00 0x00
 
@@ -6196,6 +7713,39 @@
 # CHECK: ldxbra %f13, 0, %f0, 1
 0xb3 0x45 0x01 0xd0
 
+# CHECK: ldxr %f0, %f0
+0x25 0x00
+
+# CHECK: ldxr %f0, %f13
+0x25 0x0d
+
+# CHECK: ldxr %f7, %f8
+0x25 0x78
+
+# CHECK: ldxr %f15, %f0
+0x25 0xf0
+
+# CHECK: ldxr %f15, %f13
+0x25 0xfd
+
+# CHECK: ldxtr %f0, 0, %f0, 0
+0xb3 0xdd 0x00 0x00
+
+# CHECK: ldxtr %f0, 0, %f0, 15
+0xb3 0xdd 0x0f 0x00
+
+# CHECK: ldxtr %f0, 0, %f13, 0
+0xb3 0xdd 0x00 0x0d
+
+# CHECK: ldxtr %f0, 15, %f0, 0
+0xb3 0xdd 0xf0 0x00
+
+# CHECK: ldxtr %f4, 5, %f8, 9
+0xb3 0xdd 0x59 0x48
+
+# CHECK: ldxtr %f13, 0, %f0, 0
+0xb3 0xdd 0x00 0xd0
+
 # CHECK: ldy %f0, -524288
 0xed 0x00 0x00 0x00 0x80 0x65
 
@@ -6280,6 +7830,36 @@
 # CHECK: ledbra %f15, 0, %f0, 1
 0xb3 0x44 0x01 0xf0
 
+# CHECK: ledr %f0, %f0
+0x35 0x00
+
+# CHECK: ledr %f0, %f15
+0x35 0x0f
+
+# CHECK: ledr %f7, %f8
+0x35 0x78
+
+# CHECK: ledr %f15, %f0
+0x35 0xf0
+
+# CHECK: ledr %f15, %f15
+0x35 0xff
+
+# CHECK: ledtr %f0, 0, %f0, 15
+0xb3 0xd5 0x0f 0x00
+
+# CHECK: ledtr %f0, 0, %f15, 0
+0xb3 0xd5 0x00 0x0f
+
+# CHECK: ledtr %f0, 15, %f0, 0
+0xb3 0xd5 0xf0 0x00
+
+# CHECK: ledtr %f4, 5, %f6, 7
+0xb3 0xd5 0x57 0x46
+
+# CHECK: ledtr %f15, 0, %f0, 0
+0xb3 0xd5 0x00 0xf0
+
 # CHECK: ler %f0, %f9
 0x38 0x09
 
@@ -6325,6 +7905,21 @@
 # CHECK: lexbra %f13, 0, %f0, 1
 0xb3 0x46 0x01 0xd0
 
+# CHECK: lexr %f0, %f0
+0xb3 0x66 0x00 0x00
+
+# CHECK: lexr %f0, %f13
+0xb3 0x66 0x00 0x0d
+
+# CHECK: lexr %f7, %f8
+0xb3 0x66 0x00 0x78
+
+# CHECK: lexr %f15, %f0
+0xb3 0x66 0x00 0xf0
+
+# CHECK: lexr %f15, %f13
+0xb3 0x66 0x00 0xfd
+
 # CHECK: ley %f0, -524288
 0xed 0x00 0x00 0x00 0x80 0x64
 
@@ -7399,6 +8994,18 @@
 # CHECK: lndbr %f15, %f9
 0xb3 0x11 0x00 0xf9
 
+# CHECK: lndr %f0, %f9
+0x21 0x09
+
+# CHECK: lndr %f0, %f15
+0x21 0x0f
+
+# CHECK: lndr %f15, %f0
+0x21 0xf0
+
+# CHECK: lndr %f15, %f9
+0x21 0xf9
+
 # CHECK: lnebr %f0, %f9
 0xb3 0x01 0x00 0x09
 
@@ -7411,6 +9018,18 @@
 # CHECK: lnebr %f15, %f9
 0xb3 0x01 0x00 0xf9
 
+# CHECK: lner %f0, %f9
+0x31 0x09
+
+# CHECK: lner %f0, %f15
+0x31 0x0f
+
+# CHECK: lner %f15, %f0
+0x31 0xf0
+
+# CHECK: lner %f15, %f9
+0x31 0xf9
+
 # CHECK: lngfr %r0, %r0
 0xb9 0x11 0x00 0x00
 
@@ -7459,6 +9078,18 @@
 # CHECK: lnxbr %f13, %f9
 0xb3 0x41 0x00 0xd9
 
+# CHECK: lnxr %f0, %f8
+0xb3 0x61 0x00 0x08
+
+# CHECK: lnxr %f0, %f13
+0xb3 0x61 0x00 0x0d
+
+# CHECK: lnxr %f13, %f0
+0xb3 0x61 0x00 0xd0
+
+# CHECK: lnxr %f13, %f9
+0xb3 0x61 0x00 0xd9
+
 # CHECK: loc %r7, 6399(%r8), 0
 0xeb 0x70 0x88 0xff 0x01 0xf2
 
@@ -7705,6 +9336,18 @@
 # CHECK: lpdg %r2, 4095(%r1), 0(%r15)
 0xc8 0x25 0x1f 0xff 0xf0 0x00
 
+# CHECK: lpdr %f0, %f9
+0x20 0x09
+
+# CHECK: lpdr %f0, %f15
+0x20 0x0f
+
+# CHECK: lpdr %f15, %f0
+0x20 0xf0
+
+# CHECK: lpdr %f15, %f9
+0x20 0xf9
+
 # CHECK: lpebr %f0, %f9
 0xb3 0x00 0x00 0x09
 
@@ -7717,6 +9360,18 @@
 # CHECK: lpebr %f15, %f9
 0xb3 0x00 0x00 0xf9
 
+# CHECK: lper %f0, %f9
+0x30 0x09
+
+# CHECK: lper %f0, %f15
+0x30 0x0f
+
+# CHECK: lper %f15, %f0
+0x30 0xf0
+
+# CHECK: lper %f15, %f9
+0x30 0xf9
+
 # CHECK: lpgfr %r0, %r0
 0xb9 0x10 0x00 0x00
 
@@ -7795,6 +9450,18 @@
 # CHECK: lpxbr %f13, %f9
 0xb3 0x40 0x00 0xd9
 
+# CHECK: lpxr %f0, %f8
+0xb3 0x60 0x00 0x08
+
+# CHECK: lpxr %f0, %f13
+0xb3 0x60 0x00 0x0d
+
+# CHECK: lpxr %f13, %f0
+0xb3 0x60 0x00 0xd0
+
+# CHECK: lpxr %f13, %f9
+0xb3 0x60 0x00 0xd9
+
 # CHECK: lr %r0, %r9
 0x18 0x09
 
@@ -7969,6 +9636,30 @@
 # CHECK: ltdbr %f15, %f9
 0xb3 0x12 0x00 0xf9
 
+# CHECK: ltdr %f0, %f9
+0x22 0x09
+
+# CHECK: ltdr %f0, %f15
+0x22 0x0f
+
+# CHECK: ltdr %f15, %f0
+0x22 0xf0
+
+# CHECK: ltdr %f15, %f9
+0x22 0xf9
+
+# CHECK: ltdtr %f0, %f9
+0xb3 0xd6 0x00 0x09
+
+# CHECK: ltdtr %f0, %f15
+0xb3 0xd6 0x00 0x0f
+
+# CHECK: ltdtr %f15, %f0
+0xb3 0xd6 0x00 0xf0
+
+# CHECK: ltdtr %f15, %f9
+0xb3 0xd6 0x00 0xf9
+
 # CHECK: ltebr %f0, %f9
 0xb3 0x02 0x00 0x09
 
@@ -7981,6 +9672,18 @@
 # CHECK: ltebr %f15, %f9
 0xb3 0x02 0x00 0xf9
 
+# CHECK: lter %f0, %f9
+0x32 0x09
+
+# CHECK: lter %f0, %f15
+0x32 0x0f
+
+# CHECK: lter %f15, %f0
+0x32 0xf0
+
+# CHECK: lter %f15, %f9
+0x32 0xf9
+
 # CHECK: ltg %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x02
 
@@ -8089,6 +9792,168 @@
 # CHECK: ltxbr %f13, %f9
 0xb3 0x42 0x00 0xd9
 
+# CHECK: ltxr %f0, %f9
+0xb3 0x62 0x00 0x09
+
+# CHECK: ltxr %f0, %f13
+0xb3 0x62 0x00 0x0d
+
+# CHECK: ltxr %f13, %f0
+0xb3 0x62 0x00 0xd0
+
+# CHECK: ltxr %f13, %f9
+0xb3 0x62 0x00 0xd9
+
+# CHECK: ltxtr %f0, %f9
+0xb3 0xde 0x00 0x09
+
+# CHECK: ltxtr %f0, %f13
+0xb3 0xde 0x00 0x0d
+
+# CHECK: ltxtr %f13, %f0
+0xb3 0xde 0x00 0xd0
+
+# CHECK: ltxtr %f13, %f9
+0xb3 0xde 0x00 0xd9
+
+# CHECK: lxd %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x25
+
+# CHECK: lxd %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x25
+
+# CHECK: lxd %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x25
+
+# CHECK: lxd %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x25
+
+# CHECK: lxd %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x25
+
+# CHECK: lxd %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x25
+
+# CHECK: lxdb %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x05
+
+# CHECK: lxdb %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x05
+
+# CHECK: lxdb %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x05
+
+# CHECK: lxdb %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x05
+
+# CHECK: lxdb %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x05
+
+# CHECK: lxdb %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x05
+
+# CHECK: lxdb %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x05
+
+# CHECK: lxdbr %f0, %f8
+0xb3 0x05 0x00 0x08
+
+# CHECK: lxdbr %f0, %f13
+0xb3 0x05 0x00 0x0d
+
+# CHECK: lxdbr %f13, %f0
+0xb3 0x05 0x00 0xd0
+
+# CHECK: lxdbr %f13, %f15
+0xb3 0x05 0x00 0xdf
+
+# CHECK: lxdr %f0, %f8
+0xb3 0x25 0x00 0x08
+
+# CHECK: lxdr %f0, %f13
+0xb3 0x25 0x00 0x0d
+
+# CHECK: lxdr %f13, %f0
+0xb3 0x25 0x00 0xd0
+
+# CHECK: lxdr %f13, %f15
+0xb3 0x25 0x00 0xdf
+
+# CHECK: lxdtr %f0, %f0, 15
+0xb3 0xdc 0x0f 0x00
+
+# CHECK: lxdtr %f0, %f15, 0
+0xb3 0xdc 0x00 0x0f
+
+# CHECK: lxdtr %f5, %f8, 9
+0xb3 0xdc 0x09 0x58
+
+# CHECK: lxdtr %f13, %f0, 0
+0xb3 0xdc 0x00 0xd0
+
+# CHECK: lxe %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x26
+
+# CHECK: lxe %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x26
+
+# CHECK: lxe %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x26
+
+# CHECK: lxe %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x26
+
+# CHECK: lxe %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x26
+
+# CHECK: lxe %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x26
+
+# CHECK: lxeb %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x06
+
+# CHECK: lxeb %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x06
+
+# CHECK: lxeb %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x06
+
+# CHECK: lxeb %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x06
+
+# CHECK: lxeb %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x06
+
+# CHECK: lxeb %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x06
+
+# CHECK: lxeb %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x06
+
+# CHECK: lxebr %f0, %f8
+0xb3 0x06 0x00 0x08
+
+# CHECK: lxebr %f0, %f13
+0xb3 0x06 0x00 0x0d
+
+# CHECK: lxebr %f13, %f0
+0xb3 0x06 0x00 0xd0
+
+# CHECK: lxebr %f13, %f15
+0xb3 0x06 0x00 0xdf
+
+# CHECK: lxer %f0, %f8
+0xb3 0x26 0x00 0x08
+
+# CHECK: lxer %f0, %f13
+0xb3 0x26 0x00 0x0d
+
+# CHECK: lxer %f13, %f0
+0xb3 0x26 0x00 0xd0
+
+# CHECK: lxer %f13, %f15
+0xb3 0x26 0x00 0xdf
+
 # CHECK: lxr %f0, %f8
 0xb3 0x65 0x00 0x08
 
@@ -8179,6 +10044,33 @@
 # CHECK: m %r14, 0
 0x5c 0xe0 0x00 0x00
 
+# CHECK: mad %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x3e
+
+# CHECK: mad %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x3e
+
+# CHECK: mad %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x3e
+
+# CHECK: mad %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x3e
+
+# CHECK: mad %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x3e
+
+# CHECK: mad %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x3e
+
+# CHECK: mad %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x3e
+
+# CHECK: mad %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x3e
+
+# CHECK: mad %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x3e
+
 # CHECK: madb %f0, %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x1e
 
@@ -8224,6 +10116,51 @@
 # CHECK: madbr %f15, %f15, %f15
 0xb3 0x1e 0xf0 0xff
 
+# CHECK: madr %f0, %f0, %f0
+0xb3 0x3e 0x00 0x00
+
+# CHECK: madr %f0, %f0, %f15
+0xb3 0x3e 0x00 0x0f
+
+# CHECK: madr %f0, %f15, %f0
+0xb3 0x3e 0x00 0xf0
+
+# CHECK: madr %f15, %f0, %f0
+0xb3 0x3e 0xf0 0x00
+
+# CHECK: madr %f7, %f8, %f9
+0xb3 0x3e 0x70 0x89
+
+# CHECK: madr %f15, %f15, %f15
+0xb3 0x3e 0xf0 0xff
+
+# CHECK: mae %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x2e
+
+# CHECK: mae %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x2e
+
+# CHECK: mae %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x2e
+
+# CHECK: mae %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x2e
+
+# CHECK: mae %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x2e
+
+# CHECK: mae %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x2e
+
+# CHECK: mae %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x2e
+
+# CHECK: mae %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x2e
+
+# CHECK: mae %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x2e
+
 # CHECK: maeb %f0, %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x0e
 
@@ -8269,6 +10206,159 @@
 # CHECK: maebr %f15, %f15, %f15
 0xb3 0x0e 0xf0 0xff
 
+# CHECK: maer %f0, %f0, %f0
+0xb3 0x2e 0x00 0x00
+
+# CHECK: maer %f0, %f0, %f15
+0xb3 0x2e 0x00 0x0f
+
+# CHECK: maer %f0, %f15, %f0
+0xb3 0x2e 0x00 0xf0
+
+# CHECK: maer %f15, %f0, %f0
+0xb3 0x2e 0xf0 0x00
+
+# CHECK: maer %f7, %f8, %f9
+0xb3 0x2e 0x70 0x89
+
+# CHECK: maer %f15, %f15, %f15
+0xb3 0x2e 0xf0 0xff
+
+# CHECK: may %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x3a
+
+# CHECK: may %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x3a
+
+# CHECK: may %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x3a
+
+# CHECK: may %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x3a
+
+# CHECK: may %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x3a
+
+# CHECK: may %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x3a
+
+# CHECK: may %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x3a
+
+# CHECK: may %f13, %f0, 0
+0xed 0x00 0x00 0x00 0xd0 0x3a
+
+# CHECK: may %f13, %f15, 0
+0xed 0xf0 0x00 0x00 0xd0 0x3a
+
+# CHECK: mayh %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x3c
+
+# CHECK: mayh %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x3c
+
+# CHECK: mayh %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x3c
+
+# CHECK: mayh %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x3c
+
+# CHECK: mayh %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x3c
+
+# CHECK: mayh %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x3c
+
+# CHECK: mayh %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x3c
+
+# CHECK: mayh %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x3c
+
+# CHECK: mayh %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x3c
+
+# CHECK: mayhr %f0, %f0, %f0
+0xb3 0x3c 0x00 0x00
+
+# CHECK: mayhr %f0, %f0, %f15
+0xb3 0x3c 0x00 0x0f
+
+# CHECK: mayhr %f0, %f15, %f0
+0xb3 0x3c 0x00 0xf0
+
+# CHECK: mayhr %f15, %f0, %f0
+0xb3 0x3c 0xf0 0x00
+
+# CHECK: mayhr %f7, %f8, %f9
+0xb3 0x3c 0x70 0x89
+
+# CHECK: mayhr %f15, %f15, %f15
+0xb3 0x3c 0xf0 0xff
+
+# CHECK: mayl %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x38
+
+# CHECK: mayl %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x38
+
+# CHECK: mayl %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x38
+
+# CHECK: mayl %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x38
+
+# CHECK: mayl %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x38
+
+# CHECK: mayl %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x38
+
+# CHECK: mayl %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x38
+
+# CHECK: mayl %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x38
+
+# CHECK: mayl %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x38
+
+# CHECK: maylr %f0, %f0, %f0
+0xb3 0x38 0x00 0x00
+
+# CHECK: maylr %f0, %f0, %f15
+0xb3 0x38 0x00 0x0f
+
+# CHECK: maylr %f0, %f15, %f0
+0xb3 0x38 0x00 0xf0
+
+# CHECK: maylr %f15, %f0, %f0
+0xb3 0x38 0xf0 0x00
+
+# CHECK: maylr %f7, %f8, %f9
+0xb3 0x38 0x70 0x89
+
+# CHECK: maylr %f15, %f15, %f15
+0xb3 0x38 0xf0 0xff
+
+# CHECK: mayr %f0, %f0, %f0
+0xb3 0x3a 0x00 0x00
+
+# CHECK: mayr %f0, %f0, %f15
+0xb3 0x3a 0x00 0x0f
+
+# CHECK: mayr %f0, %f15, %f0
+0xb3 0x3a 0x00 0xf0
+
+# CHECK: mayr %f13, %f0, %f0
+0xb3 0x3a 0xd0 0x00
+
+# CHECK: mayr %f5, %f8, %f9
+0xb3 0x3a 0x50 0x89
+
+# CHECK: mayr %f13, %f15, %f15
+0xb3 0x3a 0xd0 0xff
+
 # CHECK: mc 0, 0
 0xaf 0x00 0x00 0x00
 
@@ -8290,6 +10380,27 @@
 # CHECK: mc 4095(%r15), 42
 0xaf 0x2a 0xff 0xff
 
+# CHECK: md %f0, 0
+0x6c 0x00 0x00 0x00
+
+# CHECK: md %f0, 4095
+0x6c 0x00 0x0f 0xff
+
+# CHECK: md %f0, 0(%r1)
+0x6c 0x00 0x10 0x00
+
+# CHECK: md %f0, 0(%r15)
+0x6c 0x00 0xf0 0x00
+
+# CHECK: md %f0, 4095(%r1,%r15)
+0x6c 0x01 0xff 0xff
+
+# CHECK: md %f0, 4095(%r15,%r1)
+0x6c 0x0f 0x1f 0xff
+
+# CHECK: md %f15, 0
+0x6c 0xf0 0x00 0x00
+
 # CHECK: mdb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x1c
 
@@ -8323,6 +10434,27 @@
 # CHECK: mdbr %f15, %f0
 0xb3 0x1c 0x00 0xf0
 
+# CHECK: mde %f0, 0
+0x7c 0x00 0x00 0x00
+
+# CHECK: mde %f0, 4095
+0x7c 0x00 0x0f 0xff
+
+# CHECK: mde %f0, 0(%r1)
+0x7c 0x00 0x10 0x00
+
+# CHECK: mde %f0, 0(%r15)
+0x7c 0x00 0xf0 0x00
+
+# CHECK: mde %f0, 4095(%r1,%r15)
+0x7c 0x01 0xff 0xff
+
+# CHECK: mde %f0, 4095(%r15,%r1)
+0x7c 0x0f 0x1f 0xff
+
+# CHECK: mde %f15, 0
+0x7c 0xf0 0x00 0x00
+
 # CHECK: mdeb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x0c
 
@@ -8356,6 +10488,84 @@
 # CHECK: mdebr %f15, %f0
 0xb3 0x0c 0x00 0xf0
 
+# CHECK: mder %f0, %f0
+0x3c 0x00
+
+# CHECK: mder %f0, %f15
+0x3c 0x0f
+
+# CHECK: mder %f7, %f8
+0x3c 0x78
+
+# CHECK: mder %f15, %f0
+0x3c 0xf0
+
+# CHECK: mdr %f0, %f0
+0x2c 0x00
+
+# CHECK: mdr %f0, %f15
+0x2c 0x0f
+
+# CHECK: mdr %f7, %f8
+0x2c 0x78
+
+# CHECK: mdr %f15, %f0
+0x2c 0xf0
+
+# CHECK: mdtr %f0, %f0, %f0
+0xb3 0xd0 0x00 0x00
+
+# CHECK: mdtr %f0, %f0, %f15
+0xb3 0xd0 0xf0 0x00
+
+# CHECK: mdtr %f0, %f15, %f0
+0xb3 0xd0 0x00 0x0f
+
+# CHECK: mdtr %f15, %f0, %f0
+0xb3 0xd0 0x00 0xf0
+
+# CHECK: mdtr %f7, %f8, %f9
+0xb3 0xd0 0x90 0x78
+
+# CHECK: mdtra %f0, %f0, %f0, 1
+0xb3 0xd0 0x01 0x00
+
+# CHECK: mdtra %f0, %f0, %f0, 15
+0xb3 0xd0 0x0f 0x00
+
+# CHECK: mdtra %f0, %f0, %f15, 1
+0xb3 0xd0 0xf1 0x00
+
+# CHECK: mdtra %f0, %f15, %f0, 1
+0xb3 0xd0 0x01 0x0f
+
+# CHECK: mdtra %f15, %f0, %f0, 1
+0xb3 0xd0 0x01 0xf0
+
+# CHECK: mdtra %f7, %f8, %f9, 10
+0xb3 0xd0 0x9a 0x78
+
+# CHECK: mee %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x37
+
+# CHECK: mee %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x37
+
+# CHECK: mee %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x37
+
+# CHECK: mee %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x37
+
+# CHECK: mee %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x37
+
+# CHECK: mee %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x37
+
+# CHECK: mee %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x37
+
 # CHECK: meeb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x17
 
@@ -8389,6 +10599,18 @@
 # CHECK: meebr %f15, %f0
 0xb3 0x17 0x00 0xf0
 
+# CHECK: meer %f0, %f0
+0xb3 0x37 0x00 0x00
+
+# CHECK: meer %f0, %f15
+0xb3 0x37 0x00 0x0f
+
+# CHECK: meer %f7, %f8
+0xb3 0x37 0x00 0x78
+
+# CHECK: meer %f15, %f0
+0xb3 0x37 0x00 0xf0
+
 # CHECK: mfy %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x5c
 
@@ -8665,6 +10887,33 @@
 # CHECK: ms %r15, 0
 0x71 0xf0 0x00 0x00
 
+# CHECK: msd %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x3f
+
+# CHECK: msd %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x3f
+
+# CHECK: msd %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x3f
+
+# CHECK: msd %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x3f
+
+# CHECK: msd %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x3f
+
+# CHECK: msd %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x3f
+
+# CHECK: msd %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x3f
+
+# CHECK: msd %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x3f
+
+# CHECK: msd %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x3f
+
 # CHECK: msdb %f0, %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x1f
 
@@ -8710,6 +10959,51 @@
 # CHECK: msdbr %f15, %f15, %f15
 0xb3 0x1f 0xf0 0xff
 
+# CHECK: msdr %f0, %f0, %f0
+0xb3 0x3f 0x00 0x00
+
+# CHECK: msdr %f0, %f0, %f15
+0xb3 0x3f 0x00 0x0f
+
+# CHECK: msdr %f0, %f15, %f0
+0xb3 0x3f 0x00 0xf0
+
+# CHECK: msdr %f15, %f0, %f0
+0xb3 0x3f 0xf0 0x00
+
+# CHECK: msdr %f7, %f8, %f9
+0xb3 0x3f 0x70 0x89
+
+# CHECK: msdr %f15, %f15, %f15
+0xb3 0x3f 0xf0 0xff
+
+# CHECK: mse %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x2f
+
+# CHECK: mse %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x2f
+
+# CHECK: mse %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x2f
+
+# CHECK: mse %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x2f
+
+# CHECK: mse %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x2f
+
+# CHECK: mse %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x2f
+
+# CHECK: mse %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x2f
+
+# CHECK: mse %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x2f
+
+# CHECK: mse %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x2f
+
 # CHECK: mseb %f0, %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x0f
 
@@ -8755,6 +11049,24 @@
 # CHECK: msebr %f15, %f15, %f15
 0xb3 0x0f 0xf0 0xff
 
+# CHECK: mser %f0, %f0, %f0
+0xb3 0x2f 0x00 0x00
+
+# CHECK: mser %f0, %f0, %f15
+0xb3 0x2f 0x00 0x0f
+
+# CHECK: mser %f0, %f15, %f0
+0xb3 0x2f 0x00 0xf0
+
+# CHECK: mser %f15, %f0, %f0
+0xb3 0x2f 0xf0 0x00
+
+# CHECK: mser %f7, %f8, %f9
+0xb3 0x2f 0x70 0x89
+
+# CHECK: mser %f15, %f15, %f15
+0xb3 0x2f 0xf0 0xff
+
 # CHECK: msfi %r0, -2147483648
 0xc2 0x01 0x80 0x00 0x00 0x00
 
@@ -9361,6 +11673,27 @@
 # CHECK: mxbr %f13, %f13
 0xb3 0x4c 0x00 0xdd
 
+# CHECK: mxd %f0, 0
+0x67 0x00 0x00 0x00
+
+# CHECK: mxd %f0, 4095
+0x67 0x00 0x0f 0xff
+
+# CHECK: mxd %f0, 0(%r1)
+0x67 0x00 0x10 0x00
+
+# CHECK: mxd %f0, 0(%r15)
+0x67 0x00 0xf0 0x00
+
+# CHECK: mxd %f0, 4095(%r1,%r15)
+0x67 0x01 0xff 0xff
+
+# CHECK: mxd %f0, 4095(%r15,%r1)
+0x67 0x0f 0x1f 0xff
+
+# CHECK: mxd %f13, 0
+0x67 0xd0 0x00 0x00
+
 # CHECK: mxdb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x07
 
@@ -9394,6 +11727,198 @@
 # CHECK: mxdbr %f13, %f0
 0xb3 0x07 0x00 0xd0
 
+# CHECK: mxdr %f0, %f0
+0x27 0x00
+
+# CHECK: mxdr %f0, %f15
+0x27 0x0f
+
+# CHECK: mxdr %f8, %f8
+0x27 0x88
+
+# CHECK: mxdr %f13, %f0
+0x27 0xd0
+
+# CHECK: mxr %f0, %f0
+0x26 0x00
+
+# CHECK: mxr %f0, %f13
+0x26 0x0d
+
+# CHECK: mxr %f8, %f5
+0x26 0x85
+
+# CHECK: mxr %f13, %f13
+0x26 0xdd
+
+# CHECK: mxtr %f0, %f0, %f0
+0xb3 0xd8 0x00 0x00
+
+# CHECK: mxtr %f0, %f0, %f13
+0xb3 0xd8 0xd0 0x00
+
+# CHECK: mxtr %f0, %f13, %f0
+0xb3 0xd8 0x00 0x0d
+
+# CHECK: mxtr %f13, %f0, %f0
+0xb3 0xd8 0x00 0xd0
+
+# CHECK: mxtr %f8, %f8, %f8
+0xb3 0xd8 0x80 0x88
+
+# CHECK: mxtra %f0, %f0, %f0, 1
+0xb3 0xd8 0x01 0x00
+
+# CHECK: mxtra %f0, %f0, %f0, 15
+0xb3 0xd8 0x0f 0x00
+
+# CHECK: mxtra %f0, %f0, %f13, 1
+0xb3 0xd8 0xd1 0x00
+
+# CHECK: mxtra %f0, %f13, %f0, 1
+0xb3 0xd8 0x01 0x0d
+
+# CHECK: mxtra %f13, %f0, %f0, 1
+0xb3 0xd8 0x01 0xd0
+
+# CHECK: mxtra %f8, %f8, %f8, 8
+0xb3 0xd8 0x88 0x88
+
+# CHECK: my %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x3b
+
+# CHECK: my %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x3b
+
+# CHECK: my %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x3b
+
+# CHECK: my %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x3b
+
+# CHECK: my %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x3b
+
+# CHECK: my %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x3b
+
+# CHECK: my %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x3b
+
+# CHECK: my %f13, %f0, 0
+0xed 0x00 0x00 0x00 0xd0 0x3b
+
+# CHECK: my %f13, %f15, 0
+0xed 0xf0 0x00 0x00 0xd0 0x3b
+
+# CHECK: myh %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x3d
+
+# CHECK: myh %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x3d
+
+# CHECK: myh %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x3d
+
+# CHECK: myh %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x3d
+
+# CHECK: myh %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x3d
+
+# CHECK: myh %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x3d
+
+# CHECK: myh %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x3d
+
+# CHECK: myh %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x3d
+
+# CHECK: myh %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x3d
+
+# CHECK: myhr %f0, %f0, %f0
+0xb3 0x3d 0x00 0x00
+
+# CHECK: myhr %f0, %f0, %f15
+0xb3 0x3d 0x00 0x0f
+
+# CHECK: myhr %f0, %f15, %f0
+0xb3 0x3d 0x00 0xf0
+
+# CHECK: myhr %f15, %f0, %f0
+0xb3 0x3d 0xf0 0x00
+
+# CHECK: myhr %f7, %f8, %f9
+0xb3 0x3d 0x70 0x89
+
+# CHECK: myhr %f15, %f15, %f15
+0xb3 0x3d 0xf0 0xff
+
+# CHECK: myl %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x39
+
+# CHECK: myl %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x39
+
+# CHECK: myl %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x39
+
+# CHECK: myl %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x39
+
+# CHECK: myl %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x39
+
+# CHECK: myl %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x39
+
+# CHECK: myl %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x39
+
+# CHECK: myl %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x39
+
+# CHECK: myl %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x39
+
+# CHECK: mylr %f0, %f0, %f0
+0xb3 0x39 0x00 0x00
+
+# CHECK: mylr %f0, %f0, %f15
+0xb3 0x39 0x00 0x0f
+
+# CHECK: mylr %f0, %f15, %f0
+0xb3 0x39 0x00 0xf0
+
+# CHECK: mylr %f15, %f0, %f0
+0xb3 0x39 0xf0 0x00
+
+# CHECK: mylr %f7, %f8, %f9
+0xb3 0x39 0x70 0x89
+
+# CHECK: mylr %f15, %f15, %f15
+0xb3 0x39 0xf0 0xff
+
+# CHECK: myr %f0, %f0, %f0
+0xb3 0x3b 0x00 0x00
+
+# CHECK: myr %f0, %f0, %f15
+0xb3 0x3b 0x00 0x0f
+
+# CHECK: myr %f0, %f15, %f0
+0xb3 0x3b 0x00 0xf0
+
+# CHECK: myr %f13, %f0, %f0
+0xb3 0x3b 0xd0 0x00
+
+# CHECK: myr %f5, %f8, %f9
+0xb3 0x3b 0x50 0x89
+
+# CHECK: myr %f13, %f15, %f15
+0xb3 0x3b 0xd0 0xff
+
 # CHECK: n %r0, 0
 0x54 0x00 0x00 0x00
 
@@ -10051,6 +12576,9 @@
 # CHECK: pfd 15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x36
 
+# CHECK: pfpo
+0x01 0x0a
+
 # CHECK: pka 0, 0(1)
 0xe9 0x00 0x00 0x00 0x00 0x00
 
@@ -10174,6 +12702,42 @@
 # CHECK: pr
 0x01 0x01
 
+# CHECK: qadtr %f0, %f0, %f0, 0
+0xb3 0xf5 0x00 0x00
+
+# CHECK: qadtr %f0, %f0, %f0, 15
+0xb3 0xf5 0x0f 0x00
+
+# CHECK: qadtr %f0, %f0, %f15, 0
+0xb3 0xf5 0x00 0x0f
+
+# CHECK: qadtr %f0, %f15, %f0, 0
+0xb3 0xf5 0xf0 0x00
+
+# CHECK: qadtr %f4, %f5, %f6, 7
+0xb3 0xf5 0x57 0x46
+
+# CHECK: qadtr %f15, %f0, %f0, 0
+0xb3 0xf5 0x00 0xf0
+
+# CHECK: qaxtr %f0, %f0, %f0, 0
+0xb3 0xfd 0x00 0x00
+
+# CHECK: qaxtr %f0, %f0, %f0, 15
+0xb3 0xfd 0x0f 0x00
+
+# CHECK: qaxtr %f0, %f0, %f13, 0
+0xb3 0xfd 0x00 0x0d
+
+# CHECK: qaxtr %f0, %f13, %f0, 0
+0xb3 0xfd 0xd0 0x00
+
+# CHECK: qaxtr %f8, %f8, %f8, 8
+0xb3 0xfd 0x88 0x88
+
+# CHECK: qaxtr %f13, %f0, %f0, 0
+0xb3 0xfd 0x00 0xd0
+
 # CHECK: risbg %r0, %r0, 0, 0, 0
 0xec 0x00 0x00 0x00 0x00 0x55
 
@@ -10372,6 +12936,42 @@
 # CHECK: rosbg %r4, %r5, 6, 7, 8
 0xec 0x45 0x06 0x07 0x08 0x56
 
+# CHECK: rrdtr %f0, %f0, %f0, 0
+0xb3 0xf7 0x00 0x00
+
+# CHECK: rrdtr %f0, %f0, %f0, 15
+0xb3 0xf7 0x0f 0x00
+
+# CHECK: rrdtr %f0, %f0, %f15, 0
+0xb3 0xf7 0x00 0x0f
+
+# CHECK: rrdtr %f0, %f15, %f0, 0
+0xb3 0xf7 0xf0 0x00
+
+# CHECK: rrdtr %f4, %f5, %f6, 7
+0xb3 0xf7 0x57 0x46
+
+# CHECK: rrdtr %f15, %f0, %f0, 0
+0xb3 0xf7 0x00 0xf0
+
+# CHECK: rrxtr %f0, %f0, %f0, 0
+0xb3 0xff 0x00 0x00
+
+# CHECK: rrxtr %f0, %f0, %f0, 15
+0xb3 0xff 0x0f 0x00
+
+# CHECK: rrxtr %f0, %f0, %f13, 0
+0xb3 0xff 0x00 0x0d
+
+# CHECK: rrxtr %f0, %f13, %f0, 0
+0xb3 0xff 0xd0 0x00
+
+# CHECK: rrxtr %f8, %f8, %f8, 8
+0xb3 0xff 0x88 0x88
+
+# CHECK: rrxtr %f13, %f0, %f0, 0
+0xb3 0xff 0x00 0xd0
+
 # CHECK: rxsbg %r0, %r0, 0, 0, 0
 0xec 0x00 0x00 0x00 0x00 0x57
 
@@ -10438,6 +13038,27 @@
 # CHECK: sar %a15, %r15
 0xb2 0x4e 0x00 0xff
 
+# CHECK: sd %f0, 0
+0x6b 0x00 0x00 0x00
+
+# CHECK: sd %f0, 4095
+0x6b 0x00 0x0f 0xff
+
+# CHECK: sd %f0, 0(%r1)
+0x6b 0x00 0x10 0x00
+
+# CHECK: sd %f0, 0(%r15)
+0x6b 0x00 0xf0 0x00
+
+# CHECK: sd %f0, 4095(%r1,%r15)
+0x6b 0x01 0xff 0xff
+
+# CHECK: sd %f0, 4095(%r15,%r1)
+0x6b 0x0f 0x1f 0xff
+
+# CHECK: sd %f15, 0
+0x6b 0xf0 0x00 0x00
+
 # CHECK: sdb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x1b
 
@@ -10471,6 +13092,72 @@
 # CHECK: sdbr %f15, %f0
 0xb3 0x1b 0x00 0xf0
 
+# CHECK: sdr %f0, %f0
+0x2b 0x00
+
+# CHECK: sdr %f0, %f15
+0x2b 0x0f
+
+# CHECK: sdr %f7, %f8
+0x2b 0x78
+
+# CHECK: sdr %f15, %f0
+0x2b 0xf0
+
+# CHECK: sdtr %f0, %f0, %f0
+0xb3 0xd3 0x00 0x00
+
+# CHECK: sdtr %f0, %f0, %f15
+0xb3 0xd3 0xf0 0x00
+
+# CHECK: sdtr %f0, %f15, %f0
+0xb3 0xd3 0x00 0x0f
+
+# CHECK: sdtr %f15, %f0, %f0
+0xb3 0xd3 0x00 0xf0
+
+# CHECK: sdtr %f7, %f8, %f9
+0xb3 0xd3 0x90 0x78
+
+# CHECK: sdtra %f0, %f0, %f0, 1
+0xb3 0xd3 0x01 0x00
+
+# CHECK: sdtra %f0, %f0, %f0, 15
+0xb3 0xd3 0x0f 0x00
+
+# CHECK: sdtra %f0, %f0, %f15, 1
+0xb3 0xd3 0xf1 0x00
+
+# CHECK: sdtra %f0, %f15, %f0, 1
+0xb3 0xd3 0x01 0x0f
+
+# CHECK: sdtra %f15, %f0, %f0, 1
+0xb3 0xd3 0x01 0xf0
+
+# CHECK: sdtra %f7, %f8, %f9, 10
+0xb3 0xd3 0x9a 0x78
+
+# CHECK: se %f0, 0
+0x7b 0x00 0x00 0x00
+
+# CHECK: se %f0, 4095
+0x7b 0x00 0x0f 0xff
+
+# CHECK: se %f0, 0(%r1)
+0x7b 0x00 0x10 0x00
+
+# CHECK: se %f0, 0(%r15)
+0x7b 0x00 0xf0 0x00
+
+# CHECK: se %f0, 4095(%r1,%r15)
+0x7b 0x01 0xff 0xff
+
+# CHECK: se %f0, 4095(%r15,%r1)
+0x7b 0x0f 0x1f 0xff
+
+# CHECK: se %f15, 0
+0x7b 0xf0 0x00 0x00
+
 # CHECK: seb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x0b
 
@@ -10504,6 +13191,18 @@
 # CHECK: sebr %f15, %f0
 0xb3 0x0b 0x00 0xf0
 
+# CHECK: ser %f0, %f0
+0x3b 0x00
+
+# CHECK: ser %f0, %f15
+0x3b 0x0f
+
+# CHECK: ser %f7, %f8
+0x3b 0x78
+
+# CHECK: ser %f15, %f0
+0x3b 0xf0
+
 # CHECK: sfasr %r0
 0xb3 0x85 0x00 0x00
 
@@ -10912,6 +13611,33 @@
 # CHECK: sldl %r0, 4095(%r15)
 0x8d 0x00 0xff 0xff
 
+# CHECK: sldt %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x40
+
+# CHECK: sldt %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x40
+
+# CHECK: sldt %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x40
+
+# CHECK: sldt %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x40
+
+# CHECK: sldt %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x40
+
+# CHECK: sldt %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x40
+
+# CHECK: sldt %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x40
+
+# CHECK: sldt %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x40
+
+# CHECK: sldt %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x40
+
 # CHECK: slfi %r0, 0
 0xc2 0x05 0x00 0x00 0x00 0x00
 
@@ -11134,6 +13860,33 @@
 # CHECK: slrk %r2, %r3, %r4
 0xb9 0xfb 0x40 0x23
 
+# CHECK: slxt %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x48
+
+# CHECK: slxt %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x48
+
+# CHECK: slxt %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x48
+
+# CHECK: slxt %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x48
+
+# CHECK: slxt %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x48
+
+# CHECK: slxt %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x48
+
+# CHECK: slxt %f0, %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x48
+
+# CHECK: slxt %f13, %f0, 0
+0xed 0x00 0x00 0x00 0xd0 0x48
+
+# CHECK: slxt %f13, %f13, 0
+0xed 0xd0 0x00 0x00 0xd0 0x48
+
 # CHECK: sly %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x5f
 
@@ -11215,6 +13968,27 @@
 # CHECK: spm %r15
 0x04 0xf0
 
+# CHECK: sqd %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x35
+
+# CHECK: sqd %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x35
+
+# CHECK: sqd %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x35
+
+# CHECK: sqd %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x35
+
+# CHECK: sqd %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x35
+
+# CHECK: sqd %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x35
+
+# CHECK: sqd %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x35
+
 # CHECK: sqdb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x15
 
@@ -11248,6 +14022,39 @@
 # CHECK: sqdbr %f15, %f0
 0xb3 0x15 0x00 0xf0
 
+# CHECK: sqdr %f0, %f0
+0xb2 0x44 0x00 0x00
+
+# CHECK: sqdr %f0, %f15
+0xb2 0x44 0x00 0x0f
+
+# CHECK: sqdr %f7, %f8
+0xb2 0x44 0x00 0x78
+
+# CHECK: sqdr %f15, %f0
+0xb2 0x44 0x00 0xf0
+
+# CHECK: sqe %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x34
+
+# CHECK: sqe %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x34
+
+# CHECK: sqe %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x34
+
+# CHECK: sqe %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x34
+
+# CHECK: sqe %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x34
+
+# CHECK: sqe %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x34
+
+# CHECK: sqe %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x34
+
 # CHECK: sqeb %f0, 0
 0xed 0x00 0x00 0x00 0x00 0x14
 
@@ -11281,6 +14088,18 @@
 # CHECK: sqebr %f15, %f0
 0xb3 0x14 0x00 0xf0
 
+# CHECK: sqer %f0, %f0
+0xb2 0x45 0x00 0x00
+
+# CHECK: sqer %f0, %f15
+0xb2 0x45 0x00 0x0f
+
+# CHECK: sqer %f7, %f8
+0xb2 0x45 0x00 0x78
+
+# CHECK: sqer %f15, %f0
+0xb2 0x45 0x00 0xf0
+
 # CHECK: sqxbr %f0, %f0
 0xb3 0x16 0x00 0x00
 
@@ -11293,6 +14112,18 @@
 # CHECK: sqxbr %f13, %f0
 0xb3 0x16 0x00 0xd0
 
+# CHECK: sqxr %f0, %f0
+0xb3 0x36 0x00 0x00
+
+# CHECK: sqxr %f0, %f13
+0xb3 0x36 0x00 0x0d
+
+# CHECK: sqxr %f8, %f8
+0xb3 0x36 0x00 0x88
+
+# CHECK: sqxr %f13, %f0
+0xb3 0x36 0x00 0xd0
+
 # CHECK: sr %r0, %r0
 0x1b 0x00
 
@@ -11449,6 +14280,33 @@
 # CHECK: srdl %r0, 4095(%r15)
 0x8c 0x00 0xff 0xff
 
+# CHECK: srdt %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x41
+
+# CHECK: srdt %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x41
+
+# CHECK: srdt %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x41
+
+# CHECK: srdt %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x41
+
+# CHECK: srdt %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x41
+
+# CHECK: srdt %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x41
+
+# CHECK: srdt %f0, %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x41
+
+# CHECK: srdt %f15, %f0, 0
+0xed 0x00 0x00 0x00 0xf0 0x41
+
+# CHECK: srdt %f15, %f15, 0
+0xed 0xf0 0x00 0x00 0xf0 0x41
+
 # CHECK: srk %r0, %r0, %r0
 0xb9 0xf9 0x00 0x00
 
@@ -11668,6 +14526,33 @@
 # CHECK: srstu %r7, %r8
 0xb9 0xbe 0x00 0x78
 
+# CHECK: srxt %f0, %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x49
+
+# CHECK: srxt %f0, %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x49
+
+# CHECK: srxt %f0, %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x49
+
+# CHECK: srxt %f0, %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x49
+
+# CHECK: srxt %f0, %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x49
+
+# CHECK: srxt %f0, %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x49
+
+# CHECK: srxt %f0, %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x49
+
+# CHECK: srxt %f13, %f0, 0
+0xed 0x00 0x00 0x00 0xd0 0x49
+
+# CHECK: srxt %f13, %f13, 0
+0xed 0xd0 0x00 0x00 0xd0 0x49
+
 # CHECK: st %r0, 0
 0x50 0x00 0x00 0x00
 
@@ -12664,6 +15549,39 @@
 # CHECK: sty %r15, 0
 0xe3 0xf0 0x00 0x00 0x00 0x50
 
+# CHECK: su %f0, 0
+0x7f 0x00 0x00 0x00
+
+# CHECK: su %f0, 4095
+0x7f 0x00 0x0f 0xff
+
+# CHECK: su %f0, 0(%r1)
+0x7f 0x00 0x10 0x00
+
+# CHECK: su %f0, 0(%r15)
+0x7f 0x00 0xf0 0x00
+
+# CHECK: su %f0, 4095(%r1,%r15)
+0x7f 0x01 0xff 0xff
+
+# CHECK: su %f0, 4095(%r15,%r1)
+0x7f 0x0f 0x1f 0xff
+
+# CHECK: su %f15, 0
+0x7f 0xf0 0x00 0x00
+
+# CHECK: sur %f0, %f0
+0x3f 0x00
+
+# CHECK: sur %f0, %f15
+0x3f 0x0f
+
+# CHECK: sur %f7, %f8
+0x3f 0x78
+
+# CHECK: sur %f15, %f0
+0x3f 0xf0
+
 # CHECK: svc 0
 0x0a 0x00
 
@@ -12676,6 +15594,39 @@
 # CHECK: svc 255
 0x0a 0xff
 
+# CHECK: sw %f0, 0
+0x6f 0x00 0x00 0x00
+
+# CHECK: sw %f0, 4095
+0x6f 0x00 0x0f 0xff
+
+# CHECK: sw %f0, 0(%r1)
+0x6f 0x00 0x10 0x00
+
+# CHECK: sw %f0, 0(%r15)
+0x6f 0x00 0xf0 0x00
+
+# CHECK: sw %f0, 4095(%r1,%r15)
+0x6f 0x01 0xff 0xff
+
+# CHECK: sw %f0, 4095(%r15,%r1)
+0x6f 0x0f 0x1f 0xff
+
+# CHECK: sw %f15, 0
+0x6f 0xf0 0x00 0x00
+
+# CHECK: swr %f0, %f0
+0x2f 0x00
+
+# CHECK: swr %f0, %f15
+0x2f 0x0f
+
+# CHECK: swr %f7, %f8
+0x2f 0x78
+
+# CHECK: swr %f15, %f0
+0x2f 0xf0
+
 # CHECK: sxbr %f0, %f0
 0xb3 0x4b 0x00 0x00
 
@@ -12688,6 +15639,51 @@
 # CHECK: sxbr %f13, %f0
 0xb3 0x4b 0x00 0xd0
 
+# CHECK: sxr %f0, %f0
+0x37 0x00
+
+# CHECK: sxr %f0, %f13
+0x37 0x0d
+
+# CHECK: sxr %f8, %f8
+0x37 0x88
+
+# CHECK: sxr %f13, %f0
+0x37 0xd0
+
+# CHECK: sxtr %f0, %f0, %f0
+0xb3 0xdb 0x00 0x00
+
+# CHECK: sxtr %f0, %f0, %f13
+0xb3 0xdb 0xd0 0x00
+
+# CHECK: sxtr %f0, %f13, %f0
+0xb3 0xdb 0x00 0x0d
+
+# CHECK: sxtr %f13, %f0, %f0
+0xb3 0xdb 0x00 0xd0
+
+# CHECK: sxtr %f8, %f8, %f8
+0xb3 0xdb 0x80 0x88
+
+# CHECK: sxtra %f0, %f0, %f0, 1
+0xb3 0xdb 0x01 0x00
+
+# CHECK: sxtra %f0, %f0, %f0, 15
+0xb3 0xdb 0x0f 0x00
+
+# CHECK: sxtra %f0, %f0, %f13, 1
+0xb3 0xdb 0xd1 0x00
+
+# CHECK: sxtra %f0, %f13, %f0, 1
+0xb3 0xdb 0x01 0x0d
+
+# CHECK: sxtra %f13, %f0, %f0, 1
+0xb3 0xdb 0x01 0xd0
+
+# CHECK: sxtra %f8, %f8, %f8, 8
+0xb3 0xdb 0x88 0x88
+
 # CHECK: sy %r0, -524288
 0xe3 0x00 0x00 0x00 0x80 0x5b
 
@@ -12739,6 +15735,36 @@
 # CHECK: tam
 0x01 0x0b
 
+# CHECK: tbdr %f0, 0, %f0
+0xb3 0x51 0x00 0x00
+
+# CHECK: tbdr %f0, 0, %f15
+0xb3 0x51 0x00 0x0f
+
+# CHECK: tbdr %f0, 15, %f0
+0xb3 0x51 0xf0 0x00
+
+# CHECK: tbdr %f4, 5, %f6
+0xb3 0x51 0x50 0x46
+
+# CHECK: tbdr %f15, 0, %f0
+0xb3 0x51 0x00 0xf0
+
+# CHECK: tbedr %f0, 0, %f0
+0xb3 0x50 0x00 0x00
+
+# CHECK: tbedr %f0, 0, %f15
+0xb3 0x50 0x00 0x0f
+
+# CHECK: tbedr %f0, 15, %f0
+0xb3 0x50 0xf0 0x00
+
+# CHECK: tbedr %f4, 5, %f6
+0xb3 0x50 0x50 0x46
+
+# CHECK: tbedr %f15, 0, %f0
+0xb3 0x50 0x00 0xf0
+
 # CHECK: tbegin 0, 0
 0xe5 0x60 0x00 0x00 0x00 0x00
 
@@ -12868,9 +15894,159 @@
 # CHECK: tcxb %f13, 0
 0xed 0xd0 0x00 0x00 0x00 0x12
 
+# CHECK: tdcdt %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x54
+
+# CHECK: tdcdt %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x54
+
+# CHECK: tdcdt %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x54
+
+# CHECK: tdcdt %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x54
+
+# CHECK: tdcdt %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x54
+
+# CHECK: tdcdt %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x54
+
+# CHECK: tdcdt %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x54
+
+# CHECK: tdcet %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x50
+
+# CHECK: tdcet %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x50
+
+# CHECK: tdcet %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x50
+
+# CHECK: tdcet %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x50
+
+# CHECK: tdcet %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x50
+
+# CHECK: tdcet %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x50
+
+# CHECK: tdcet %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x50
+
+# CHECK: tdcxt %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x58
+
+# CHECK: tdcxt %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x58
+
+# CHECK: tdcxt %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x58
+
+# CHECK: tdcxt %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x58
+
+# CHECK: tdcxt %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x58
+
+# CHECK: tdcxt %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x58
+
+# CHECK: tdcxt %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x58
+
+# CHECK: tdgdt %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x55
+
+# CHECK: tdgdt %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x55
+
+# CHECK: tdgdt %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x55
+
+# CHECK: tdgdt %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x55
+
+# CHECK: tdgdt %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x55
+
+# CHECK: tdgdt %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x55
+
+# CHECK: tdgdt %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x55
+
+# CHECK: tdget %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x51
+
+# CHECK: tdget %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x51
+
+# CHECK: tdget %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x51
+
+# CHECK: tdget %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x51
+
+# CHECK: tdget %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x51
+
+# CHECK: tdget %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x51
+
+# CHECK: tdget %f15, 0
+0xed 0xf0 0x00 0x00 0x00 0x51
+
+# CHECK: tdgxt %f0, 0
+0xed 0x00 0x00 0x00 0x00 0x59
+
+# CHECK: tdgxt %f0, 4095
+0xed 0x00 0x0f 0xff 0x00 0x59
+
+# CHECK: tdgxt %f0, 0(%r1)
+0xed 0x00 0x10 0x00 0x00 0x59
+
+# CHECK: tdgxt %f0, 0(%r15)
+0xed 0x00 0xf0 0x00 0x00 0x59
+
+# CHECK: tdgxt %f0, 4095(%r1,%r15)
+0xed 0x01 0xff 0xff 0x00 0x59
+
+# CHECK: tdgxt %f0, 4095(%r15,%r1)
+0xed 0x0f 0x1f 0xff 0x00 0x59
+
+# CHECK: tdgxt %f13, 0
+0xed 0xd0 0x00 0x00 0x00 0x59
+
 # CHECK: tend
 0xb2 0xf8 0x00 0x00
 
+# CHECK: thder %f0, %f9
+0xb3 0x58 0x00 0x09
+
+# CHECK: thder %f0, %f15
+0xb3 0x58 0x00 0x0f
+
+# CHECK: thder %f15, %f0
+0xb3 0x58 0x00 0xf0
+
+# CHECK: thder %f15, %f9
+0xb3 0x58 0x00 0xf9
+
+# CHECK: thdr %f0, %f9
+0xb3 0x59 0x00 0x09
+
+# CHECK: thdr %f0, %f15
+0xb3 0x59 0x00 0x0f
+
+# CHECK: thdr %f15, %f0
+0xb3 0x59 0x00 0xf0
+
+# CHECK: thdr %f15, %f9
+0xb3 0x59 0x00 0xf9
+
 # CHECK: tm 0, 0
 0x91 0x00 0x00 0x00
 
diff --git a/test/MC/Mips/macro-li.d.s b/test/MC/Mips/macro-li.d.s
new file mode 100644
index 000000000000..e54b69ea0862
--- /dev/null
+++ b/test/MC/Mips/macro-li.d.s
@@ -0,0 +1,443 @@
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32 -target-abi=o32  | FileCheck %s --check-prefixes=ALL,O32-N32-NO-PIC,O32
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32r2 -target-abi=o32 | FileCheck %s --check-prefixes=ALL,CHECK-MIPS32r2
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips32 -target-abi=o32 -position-independent | FileCheck %s --check-prefixes=ALL,O32-N32-PIC,O32
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n32 | FileCheck %s --check-prefixes=ALL,O32-N32-NO-PIC,N32-N64
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n32 -position-independent | FileCheck %s --check-prefixes=ALL,O32-N32-PIC,N32-N64
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n64 | FileCheck %s --check-prefixes=ALL,N64-NO-PIC,N32-N64
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n64 -position-independent | FileCheck %s --check-prefixes=ALL,N64-PIC,N32-N64
+
+li.d	$4, 0
+# O32:     addiu   $4, $zero, 0                # encoding: [0x00,0x00,0x04,0x24]
+# O32:     addiu   $5, $zero, 0                # encoding: [0x00,0x00,0x05,0x24]
+# N32-N64: daddiu  $4, $zero, 0                # encoding: [0x00,0x00,0x04,0x64]
+
+li.d	$4, 0.0
+# O32:     addiu   $4, $zero, 0                # encoding: [0x00,0x00,0x04,0x24]
+# O32:     addiu   $5, $zero, 0                # encoding: [0x00,0x00,0x05,0x24]
+# N32-N64: daddiu  $4, $zero, 0                # encoding: [0x00,0x00,0x04,0x64]
+
+li.d	$4, 1.12345
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1072822694
+# ALL:	.4byte	3037400872
+# ALL:	.text
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# O32-N32-NO-PIC:  addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-PIC:     addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32:             lw      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0x8c]
+# O32:             lw      $5, 4($1)                  # encoding: [0x04,0x00,0x25,0x8c]
+# N32-N64:         ld      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0xdc]
+
+li.d	$4, 1
+# ALL:   lui     $4, 16368                   # encoding: [0xf0,0x3f,0x04,0x3c]
+# O32:   addiu   $5, $zero, 0                # encoding: [0x00,0x00,0x05,0x24]
+
+li.d	$4, 1.0
+# ALL:   lui     $4, 16368                   # encoding: [0xf0,0x3f,0x04,0x3c]
+# O32:   addiu   $5, $zero, 0                # encoding: [0x00,0x00,0x05,0x24]
+
+li.d	$4, 12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1107754720
+# ALL:	.4byte	3790602240
+# ALL:	.text
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# O32-N32-NO-PIC:  addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-PIC:     addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32:             lw      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0x8c]
+# O32:             lw      $5, 4($1)                  # encoding: [0x04,0x00,0x25,0x8c]
+# N32-N64:         ld      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0xdc]
+
+li.d	$4, 12345678910.0
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1107754720
+# ALL:	.4byte	3790602240
+# ALL:	.text
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# O32-N32-NO-PIC:  addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-PIC:     addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32:             lw      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0x8c]
+# O32:             lw      $5, 4($1)                  # encoding: [0x04,0x00,0x25,0x8c]
+# N32-N64:         ld      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0xdc]
+
+li.d	$4, 0.4
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1071225241
+# ALL:	.4byte	2576980378
+# ALL:	.text
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# O32-N32-NO-PIC:  addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-PIC:     addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32:             lw      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0x8c]
+# O32:             lw      $5, 4($1)                  # encoding: [0x04,0x00,0x25,0x8c]
+# N32-N64:         ld      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0xdc]
+
+li.d	$4, 1.5
+# ALL:  lui     $4, 16376               # encoding: [0xf8,0x3f,0x04,0x3c]
+# O32:  addiu   $5, $zero, 0            # encoding: [0x00,0x00,0x05,0x24]
+
+li.d	$4, 12345678910.12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1107754720
+# ALL:	.4byte	3790666967
+# ALL:	.text
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# O32-N32-NO-PIC:  addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-PIC:     addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32:             lw      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0x8c]
+# O32:             lw      $5, 4($1)                  # encoding: [0x04,0x00,0x25,0x8c]
+# N32-N64:         ld      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0xdc]
+
+
+li.d	$4, 12345678910123456789.12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1139108501
+# ALL:	.4byte	836738583
+# ALL:	.text
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# O32-N32-NO-PIC:  addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-PIC:     addiu   $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x24]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         daddiu  $1, $1, %lo([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+# O32:             lw      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0x8c]
+# O32:             lw      $5, 4($1)                  # encoding: [0x04,0x00,0x25,0x8c]
+# N32-N64:         ld      $4, 0($1)                  # encoding: [0x00,0x00,0x24,0xdc]
+
+li.d	$f4, 0
+# O32:            addiu   $1, $zero, 0       # encoding: [0x00,0x00,0x01,0x24]
+# O32:            mtc1    $1, $f5            # encoding: [0x00,0x28,0x81,0x44]
+# O32:            mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: addiu   $1, $zero, 0       # encoding: [0x00,0x00,0x01,0x24]
+# CHECK-MIPS32r2: mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: mthc1   $1, $f4            # encoding: [0x00,0x20,0xe1,0x44]
+# N32-N64:        addiu   $1, $zero, 0       # encoding: [0x00,0x00,0x01,0x24]
+# N32-N64:        dmtc1   $1, $f4            # encoding: [0x00,0x20,0xa1,0x44]
+
+li.d	$f4, 0.0
+# O32:            addiu   $1, $zero, 0       # encoding: [0x00,0x00,0x01,0x24]
+# O32:            mtc1    $1, $f5            # encoding: [0x00,0x28,0x81,0x44]
+# O32:            mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: addiu   $1, $zero, 0       # encoding: [0x00,0x00,0x01,0x24]
+# CHECK-MIPS32r2: mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: mthc1   $1, $f4            # encoding: [0x00,0x20,0xe1,0x44]
+# N32-N64:        addiu   $1, $zero, 0       # encoding: [0x00,0x00,0x01,0x24]
+# N32-N64:        dmtc1   $1, $f4            # encoding: [0x00,0x20,0xa1,0x44]
+
+li.d	$f4, 1.12345
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1072822694
+# ALL:	.4byte	3037400872
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.d	$f4, 1
+# O32:            lui     $1, 16368          # encoding: [0xf0,0x3f,0x01,0x3c]
+# O32:            mtc1    $1, $f5            # encoding: [0x00,0x28,0x81,0x44]
+# O32:            mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: lui     $1, 16368          # encoding: [0xf0,0x3f,0x01,0x3c]
+# CHECK-MIPS32r2: mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: mthc1   $1, $f4            # encoding: [0x00,0x20,0xe1,0x44]
+# N32-N64:        lui     $1, 16368          # encoding: [0xf0,0x3f,0x01,0x3c]
+# N32-N64:        dmtc1   $1, $f4            # encoding: [0x00,0x20,0xa1,0x44]
+
+li.d	$f4, 1.0
+# O32:            lui     $1, 16368          # encoding: [0xf0,0x3f,0x01,0x3c]
+# O32:            mtc1    $1, $f5            # encoding: [0x00,0x28,0x81,0x44]
+# O32:            mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: lui     $1, 16368          # encoding: [0xf0,0x3f,0x01,0x3c]
+# CHECK-MIPS32r2: mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: mthc1   $1, $f4            # encoding: [0x00,0x20,0xe1,0x44]
+# N32-N64:        lui     $1, 16368          # encoding: [0xf0,0x3f,0x01,0x3c]
+# N32-N64:        dmtc1   $1, $f4            # encoding: [0x00,0x20,0xa1,0x44]
+
+li.d	$f4, 12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1107754720
+# ALL:	.4byte	3790602240
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.d	$f4, 12345678910.0
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1107754720
+# ALL:	.4byte	3790602240
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.d	$f4, 0.4
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1071225241
+# ALL:	.4byte	2576980378
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.d	$f4, 1.5
+# O32:            lui     $1, 16376          # encoding: [0xf8,0x3f,0x01,0x3c]
+# O32:            mtc1    $1, $f5            # encoding: [0x00,0x28,0x81,0x44]
+# O32:            mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: lui     $1, 16376          # encoding: [0xf8,0x3f,0x01,0x3c]
+# CHECK-MIPS32r2: mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: mthc1   $1, $f4            # encoding: [0x00,0x20,0xe1,0x44]
+# N32-N64:        lui     $1, 16376          # encoding: [0xf8,0x3f,0x01,0x3c]
+# N32-N64:        dmtc1   $1, $f4            # encoding: [0x00,0x20,0xa1,0x44]
+
+li.d	$f4, 2.5
+# O32:            lui     $1, 16388          # encoding: [0x04,0x40,0x01,0x3c]
+# O32:            mtc1    $1, $f5            # encoding: [0x00,0x28,0x81,0x44]
+# O32:            mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: lui     $1, 16388          # encoding: [0x04,0x40,0x01,0x3c]
+# CHECK-MIPS32r2: mtc1    $zero, $f4         # encoding: [0x00,0x20,0x80,0x44]
+# CHECK-MIPS32r2: mthc1   $1, $f4            # encoding: [0x00,0x20,0xe1,0x44]
+# N32-N64:        lui     $1, 16388          # encoding: [0x04,0x40,0x01,0x3c]
+# N32-N64:        dmtc1   $1, $f4            # encoding: [0x00,0x20,0xa1,0x44]
+
+li.d	$f4, 2.515625
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1074012160
+# ALL:	.4byte	0
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.d	$f4, 12345678910.12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1107754720
+# ALL:	.4byte	3790666967
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.d	$f4, 12345678910123456789.12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1139108501
+# ALL:	.4byte	836738583
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             ldc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xd4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
diff --git a/test/MC/Mips/macro-li.s.s b/test/MC/Mips/macro-li.s.s
new file mode 100644
index 000000000000..01eb3646211f
--- /dev/null
+++ b/test/MC/Mips/macro-li.s.s
@@ -0,0 +1,198 @@
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -target-abi=o32 | FileCheck %s --check-prefixes=ALL,O32-N32-NO-PIC
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -target-abi=o32 -position-independent | FileCheck %s --check-prefixes=ALL,O32-N32-PIC
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n32 | FileCheck %s --check-prefixes=ALL,O32-N32-NO-PIC
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n32 -position-independent | FileCheck %s --check-prefixes=ALL,O32-N32-PIC
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n64 | FileCheck %s --check-prefixes=ALL,N64-NO-PIC
+# RUN: llvm-mc  %s -triple=mipsel-unknown-linux -show-encoding -mcpu=mips64 -target-abi=n64 -position-independent | FileCheck %s --check-prefixes=ALL,N64-PIC
+
+li.s	$4, 0
+# ALL:   addiu   $4, $zero, 0                # encoding: [0x00,0x00,0x04,0x24]
+
+li.s	$4, 0.0
+# ALL:   addiu   $4, $zero, 0                # encoding: [0x00,0x00,0x04,0x24]
+
+li.s	$4, 1.12345
+# ALL:   lui     $4, 16271                   # encoding: [0x8f,0x3f,0x04,0x3c]
+# ALL:   ori     $4, $4, 52534               # encoding: [0x36,0xcd,0x84,0x34]
+
+li.s	$4, 1
+# ALL:   lui     $4, 16256                   # encoding: [0x80,0x3f,0x04,0x3c]
+
+li.s	$4, 1.0
+# ALL:   lui     $4, 16256                   # encoding: [0x80,0x3f,0x04,0x3c]
+
+li.s	$4, 12345678910
+# ALL:   lui     $4, 20535                   # encoding: [0x37,0x50,0x04,0x3c]
+# ALL:   ori     $4, $4, 63239               # encoding: [0x07,0xf7,0x84,0x34]
+
+li.s	$4, 12345678910.0
+# ALL:   lui     $4, 20535                   # encoding: [0x37,0x50,0x04,0x3c]
+# ALL:   ori     $4, $4, 63239               # encoding: [0x07,0xf7,0x84,0x34]
+
+li.s	$4, 0.4
+# ALL:   lui     $4, 16076                   # encoding: [0xcc,0x3e,0x04,0x3c]
+# ALL:   ori     $4, $4, 52429               # encoding: [0xcd,0xcc,0x84,0x34]
+
+li.s	$4, 1.5
+# ALL:   lui     $4, 16320                   # encoding: [0xc0,0x3f,0x04,0x3c]
+
+li.s	$4, 12345678910.12345678910
+# ALL:   lui     $4, 20535                   # encoding: [0x37,0x50,0x04,0x3c]
+# ALL:   ori     $4, $4, 63239               # encoding: [0x07,0xf7,0x84,0x34]
+
+li.s	$4, 12345678910123456789.12345678910
+# ALL:   lui     $4, 24363                   # encoding: [0x2b,0x5f,0x04,0x3c]
+# ALL:   ori     $4, $4, 21674               # encoding: [0xaa,0x54,0x84,0x34]
+
+li.s	$f4, 0
+# ALL:   addiu   $1, $zero, 0                # encoding: [0x00,0x00,0x01,0x24]
+# ALL:   mtc1    $1, $f4                     # encoding: [0x00,0x20,0x81,0x44]
+
+li.s	$f4, 0.0
+# ALL:   addiu   $1, $zero, 0                # encoding: [0x00,0x00,0x01,0x24]
+# ALL:   mtc1    $1, $f4                     # encoding: [0x00,0x20,0x81,0x44]
+
+li.s	$f4, 1.12345
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1066388790
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             lwc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xc4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.s	$f4, 1
+# ALL:   lui     $1, 16256                   # encoding: [0x80,0x3f,0x01,0x3c]
+# ALL:   mtc1    $1, $f4                     # encoding: [0x00,0x20,0x81,0x44]
+
+li.s	$f4, 1.0
+# ALL:   lui     $1, 16256                   # encoding: [0x80,0x3f,0x01,0x3c]
+# ALL:   mtc1    $1, $f4                     # encoding: [0x00,0x20,0x81,0x44]
+
+li.s	$f4, 12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1345844999
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             lwc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xc4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.s	$f4, 12345678910.0
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1345844999
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             lwc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xc4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+
+li.s	$f4, 0.4
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1053609165
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             lwc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xc4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.s	$f4, 1.5
+# ALL:   lui     $1, 16320                   # encoding: [0xc0,0x3f,0x01,0x3c]
+# ALL:   mtc1    $1, $f4                     # encoding: [0x00,0x20,0x81,0x44]
+
+li.s	$f4, 12345678910.12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1345844999
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             lwc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xc4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
+
+li.s	$f4, 12345678910123456789.12345678910
+# ALL:	.section	.rodata,"a",@progbits
+# ALL:  [[LABEL:\$tmp[0-9]+]]:
+# ALL:	.4byte	1596675242
+# ALL:	.text
+# O32-N32-PIC:     lw      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0x8f]
+# O32-N32-PIC:                                        #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# O32-N32-NO-PIC:  lui     $1, %hi([[LABEL]])         # encoding: [A,A,0x01,0x3c]
+# O32-N32-NO-PIC:                                     #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-PIC:         ld      $1, %got([[LABEL]])($gp)   # encoding: [A,A,0x81,0xdf]
+# N64-PIC:                                            #   fixup A - offset: 0, value: %got([[LABEL]]), kind: fixup_Mips_GOT
+# N64-NO-PIC:      lui     $1, %highest([[LABEL]])    # encoding: [A,A,0x01,0x3c]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %highest([[LABEL]]), kind: fixup_Mips_HIGHEST
+# N64-NO-PIC:      daddiu  $1, $1, %higher([[LABEL]]) # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %higher([[LABEL]]), kind: fixup_Mips_HIGHER
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# N64-NO-PIC:      daddiu  $1, $1, %hi([[LABEL]])     # encoding: [A,A,0x21,0x64]
+# N64-NO-PIC:                                         #   fixup A - offset: 0, value: %hi([[LABEL]]), kind: fixup_Mips_HI16
+# N64-NO-PIC:      dsll    $1, $1, 16                 # encoding: [0x38,0x0c,0x01,0x00]
+# ALL:             lwc1    $f4, %lo([[LABEL]])($1)    # encoding: [A,A,0x24,0xc4]
+# ALL:                                                #   fixup A - offset: 0, value: %lo([[LABEL]]), kind: fixup_Mips_LO16
diff --git a/test/MC/SystemZ/insn-bad-z13.s b/test/MC/SystemZ/insn-bad-z13.s
index 82f47feeb8a9..e9fac44aa883 100644
--- a/test/MC/SystemZ/insn-bad-z13.s
+++ b/test/MC/SystemZ/insn-bad-z13.s
@@ -4,6 +4,152 @@
 # RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch11 < %s 2> %t
 # RUN: FileCheck < %t %s
 
+#CHECK: error: invalid operand
+#CHECK: cdpt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: cdpt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: cdpt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: cdpt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdpt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdpt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdpt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdpt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: cdpt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cdpt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: cdpt	%f0, 0(-), 0
+
+	cdpt	%f0, 0(1), -1
+	cdpt	%f0, 0(1), 16
+	cdpt	%f0, 0, 0
+	cdpt	%f0, 0(%r1), 0
+	cdpt	%f0, 0(0,%r1), 0
+	cdpt	%f0, 0(257,%r1), 0
+	cdpt	%f0, -1(1,%r1), 0
+	cdpt	%f0, 4096(1,%r1), 0
+	cdpt	%f0, 0(1,%r0), 0
+	cdpt	%f0, 0(%r1,%r2), 0
+	cdpt	%f0, 0(-), 0
+
+#CHECK: error: invalid operand
+#CHECK: cpdt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: cpdt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: cpdt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: cpdt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpdt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpdt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpdt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpdt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: cpdt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cpdt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: cpdt	%f0, 0(-), 0
+
+	cpdt	%f0, 0(1), -1
+	cpdt	%f0, 0(1), 16
+	cpdt	%f0, 0, 0
+	cpdt	%f0, 0(%r1), 0
+	cpdt	%f0, 0(0,%r1), 0
+	cpdt	%f0, 0(257,%r1), 0
+	cpdt	%f0, -1(1,%r1), 0
+	cpdt	%f0, 4096(1,%r1), 0
+	cpdt	%f0, 0(1,%r0), 0
+	cpdt	%f0, 0(%r1,%r2), 0
+	cpdt	%f0, 0(-), 0
+
+#CHECK: error: invalid operand
+#CHECK: cpxt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: cpxt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: cpxt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: cpxt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpxt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpxt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpxt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cpxt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: cpxt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cpxt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: cpxt	%f0, 0(-), 0
+#CHECK: error: invalid register pair
+#CHECK: cpxt	%f15, 0(1), 0
+
+	cpxt	%f0, 0(1), -1
+	cpxt	%f0, 0(1), 16
+	cpxt	%f0, 0, 0
+	cpxt	%f0, 0(%r1), 0
+	cpxt	%f0, 0(0,%r1), 0
+	cpxt	%f0, 0(257,%r1), 0
+	cpxt	%f0, -1(1,%r1), 0
+	cpxt	%f0, 4096(1,%r1), 0
+	cpxt	%f0, 0(1,%r0), 0
+	cpxt	%f0, 0(%r1,%r2), 0
+	cpxt	%f0, 0(-), 0
+	cpxt	%f15, 0(1), 0
+
+#CHECK: error: invalid operand
+#CHECK: cxpt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: cxpt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: cxpt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: cxpt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxpt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxpt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxpt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxpt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: cxpt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cxpt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: cxpt	%f0, 0(-), 0
+#CHECK: error: invalid register pair
+#CHECK: cxpt	%f15, 0(1), 0
+
+	cxpt	%f0, 0(1), -1
+	cxpt	%f0, 0(1), 16
+	cxpt	%f0, 0, 0
+	cxpt	%f0, 0(%r1), 0
+	cxpt	%f0, 0(0,%r1), 0
+	cxpt	%f0, 0(257,%r1), 0
+	cxpt	%f0, -1(1,%r1), 0
+	cxpt	%f0, 4096(1,%r1), 0
+	cxpt	%f0, 0(1,%r0), 0
+	cxpt	%f0, 0(%r1,%r2), 0
+	cxpt	%f0, 0(-), 0
+	cxpt	%f15, 0(1), 0
+
 #CHECK: error: invalid operand
 #CHECK: lcbb	%r0, 0, -1
 #CHECK: error: invalid operand
diff --git a/test/MC/SystemZ/insn-bad-z196.s b/test/MC/SystemZ/insn-bad-z196.s
index 04c19ff6319c..78d50bca9746 100644
--- a/test/MC/SystemZ/insn-bad-z196.s
+++ b/test/MC/SystemZ/insn-bad-z196.s
@@ -4,6 +4,14 @@
 # RUN: not llvm-mc -triple s390x-linux-gnu -mcpu=arch9 < %s 2> %t
 # RUN: FileCheck < %t %s
 
+#CHECK: error: invalid operand
+#CHECK: adtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: adtra	%f0, %f0, %f0, 16
+
+	adtra	%f0, %f0, %f0, -1
+	adtra	%f0, %f0, %f0, 16
+
 #CHECK: error: invalid operand
 #CHECK: aghik	%r0, %r1, -32769
 #CHECK: error: invalid operand
@@ -34,6 +42,23 @@
 	aih	%r0, (-1 << 31) - 1
 	aih	%r0, (1 << 31)
 
+#CHECK: error: invalid operand
+#CHECK: axtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: axtra	%f0, %f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: axtra	%f0, %f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: axtra	%f0, %f2, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: axtra	%f2, %f0, %f0, 0
+
+	axtra	%f0, %f0, %f0, -1
+	axtra	%f0, %f0, %f0, 16
+	axtra	%f0, %f0, %f2, 0
+	axtra	%f0, %f2, %f0, 0
+	axtra	%f2, %f0, %f0, 0
+
 #CHECK: error: instruction requires: execution-hint
 #CHECK: bpp	0, 0, 0
 
@@ -72,6 +97,20 @@
 	cdfbra	%f0, -1, %r0, 0
 	cdfbra	%f0, 16, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cdftr	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cdftr	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cdftr	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cdftr	%f0, 16, %r0, 0
+
+	cdftr	%f0, 0, %r0, -1
+	cdftr	%f0, 0, %r0, 16
+	cdftr	%f0, -1, %r0, 0
+	cdftr	%f0, 16, %r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cdgbra	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -86,6 +125,20 @@
 	cdgbra	%f0, -1, %r0, 0
 	cdgbra	%f0, 16, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cdgtra	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cdgtra	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cdgtra	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cdgtra	%f0, 16, %r0, 0
+
+	cdgtra	%f0, 0, %r0, -1
+	cdgtra	%f0, 0, %r0, 16
+	cdgtra	%f0, -1, %r0, 0
+	cdgtra	%f0, 16, %r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cdlfbr	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -100,6 +153,25 @@
 	cdlfbr	%f0, -1, %r0, 0
 	cdlfbr	%f0, 16, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cdlftr	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cdlftr	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cdlftr	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cdlftr	%f0, 16, %r0, 0
+
+	cdlftr	%f0, 0, %r0, -1
+	cdlftr	%f0, 0, %r0, 16
+	cdlftr	%f0, -1, %r0, 0
+	cdlftr	%f0, 16, %r0, 0
+
+#CHECK: error: instruction requires: dfp-zoned-conversion
+#CHECK: cdzt	%f0, 0(1), 0
+
+	cdzt	%f0, 0(1), 0
+
 #CHECK: error: invalid operand
 #CHECK: cdlgbr	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -114,6 +186,20 @@
 	cdlgbr	%f0, -1, %r0, 0
 	cdlgbr	%f0, 16, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cdlgtr	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cdlgtr	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cdlgtr	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cdlgtr	%f0, 16, %r0, 0
+
+	cdlgtr	%f0, 0, %r0, -1
+	cdlgtr	%f0, 0, %r0, 16
+	cdlgtr	%f0, -1, %r0, 0
+	cdlgtr	%f0, 16, %r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cefbra	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -184,6 +270,20 @@
 	cfdbra	%r0, -1, %f0, 0
 	cfdbra	%r0, 16, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cfdtr	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: cfdtr	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: cfdtr	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: cfdtr	%r0, 16, %f0, 0
+
+	cfdtr	%r0, 0, %f0, -1
+	cfdtr	%r0, 0, %f0, 16
+	cfdtr	%r0, -1, %f0, 0
+	cfdtr	%r0, 16, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cfebra	%r0, 0, %f0, -1
 #CHECK: error: invalid operand
@@ -215,6 +315,23 @@
 	cfxbra	%r0, 16, %f0, 0
 	cfxbra	%r0, 0, %f14, 0
 
+#CHECK: error: invalid operand
+#CHECK: cfxtr	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: cfxtr	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: cfxtr	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: cfxtr	%r0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: cfxtr	%r0, 0, %f14, 0
+
+	cfxtr	%r0, 0, %f0, -1
+	cfxtr	%r0, 0, %f0, 16
+	cfxtr	%r0, -1, %f0, 0
+	cfxtr	%r0, 16, %f0, 0
+	cfxtr	%r0, 0, %f14, 0
+
 #CHECK: error: invalid operand
 #CHECK: cgdbra	%r0, 0, %f0, -1
 #CHECK: error: invalid operand
@@ -229,6 +346,20 @@
 	cgdbra	%r0, -1, %f0, 0
 	cgdbra	%r0, 16, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cgdtra	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: cgdtra	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: cgdtra	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: cgdtra	%r0, 16, %f0, 0
+
+	cgdtra	%r0, 0, %f0, -1
+	cgdtra	%r0, 0, %f0, 16
+	cgdtra	%r0, -1, %f0, 0
+	cgdtra	%r0, 16, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cgebra	%r0, 0, %f0, -1
 #CHECK: error: invalid operand
@@ -260,6 +391,23 @@
 	cgxbra	%r0, 16, %f0, 0
 	cgxbra	%r0, 0, %f14, 0
 
+#CHECK: error: invalid operand
+#CHECK: cgxtra	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: cgxtra	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: cgxtra	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: cgxtra	%r0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: cgxtra	%r0, 0, %f14, 0
+
+	cgxtra	%r0, 0, %f0, -1
+	cgxtra	%r0, 0, %f0, 16
+	cgxtra	%r0, -1, %f0, 0
+	cgxtra	%r0, 16, %f0, 0
+	cgxtra	%r0, 0, %f14, 0
+
 #CHECK: error: invalid operand
 #CHECK: chf	%r0, -524289
 #CHECK: error: invalid operand
@@ -290,6 +438,20 @@
 	clfdbr	%r0, -1, %f0, 0
 	clfdbr	%r0, 16, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: clfdtr	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: clfdtr	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: clfdtr	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: clfdtr	%r0, 16, %f0, 0
+
+	clfdtr	%r0, 0, %f0, -1
+	clfdtr	%r0, 0, %f0, 16
+	clfdtr	%r0, -1, %f0, 0
+	clfdtr	%r0, 16, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: clfebr	%r0, 0, %f0, -1
 #CHECK: error: invalid operand
@@ -321,6 +483,23 @@
 	clfxbr	%r0, 16, %f0, 0
 	clfxbr	%r0, 0, %f14, 0
 
+#CHECK: error: invalid operand
+#CHECK: clfxtr	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: clfxtr	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: clfxtr	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: clfxtr	%r0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: clfxtr	%r0, 0, %f14, 0
+
+	clfxtr	%r0, 0, %f0, -1
+	clfxtr	%r0, 0, %f0, 16
+	clfxtr	%r0, -1, %f0, 0
+	clfxtr	%r0, 16, %f0, 0
+	clfxtr	%r0, 0, %f14, 0
+
 #CHECK: error: invalid operand
 #CHECK: clgdbr	%r0, 0, %f0, -1
 #CHECK: error: invalid operand
@@ -335,6 +514,20 @@
 	clgdbr	%r0, -1, %f0, 0
 	clgdbr	%r0, 16, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: clgdtr	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: clgdtr	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: clgdtr	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: clgdtr	%r0, 16, %f0, 0
+
+	clgdtr	%r0, 0, %f0, -1
+	clgdtr	%r0, 0, %f0, 16
+	clgdtr	%r0, -1, %f0, 0
+	clgdtr	%r0, 16, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: clgebr	%r0, 0, %f0, -1
 #CHECK: error: invalid operand
@@ -366,6 +559,23 @@
 	clgxbr	%r0, 16, %f0, 0
 	clgxbr	%r0, 0, %f14, 0
 
+#CHECK: error: invalid operand
+#CHECK: clgxtr	%r0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: clgxtr	%r0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: clgxtr	%r0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: clgxtr	%r0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: clgxtr	%r0, 0, %f14, 0
+
+	clgxtr	%r0, 0, %f0, -1
+	clgxtr	%r0, 0, %f0, 16
+	clgxtr	%r0, -1, %f0, 0
+	clgxtr	%r0, 16, %f0, 0
+	clgxtr	%r0, 0, %f14, 0
+
 #CHECK: error: invalid operand
 #CHECK: clhf	%r0, -524289
 #CHECK: error: invalid operand
@@ -399,6 +609,23 @@
 	cxfbra	%f0, 16, %r0, 0
 	cxfbra	%f2, 0, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cxftr	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cxftr	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cxftr	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cxftr	%f0, 16, %r0, 0
+#CHECK: error: invalid register pair
+#CHECK: cxftr	%f2, 0, %r0, 0
+
+	cxftr	%f0, 0, %r0, -1
+	cxftr	%f0, 0, %r0, 16
+	cxftr	%f0, -1, %r0, 0
+	cxftr	%f0, 16, %r0, 0
+	cxftr	%f2, 0, %r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cxgbra	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -416,6 +643,23 @@
 	cxgbra	%f0, 16, %r0, 0
 	cxgbra	%f2, 0, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cxgtra	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cxgtra	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cxgtra	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cxgtra	%f0, 16, %r0, 0
+#CHECK: error: invalid register pair
+#CHECK: cxgtra	%f2, 0, %r0, 0
+
+	cxgtra	%f0, 0, %r0, -1
+	cxgtra	%f0, 0, %r0, 16
+	cxgtra	%f0, -1, %r0, 0
+	cxgtra	%f0, 16, %r0, 0
+	cxgtra	%f2, 0, %r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cxlfbr	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -433,6 +677,23 @@
 	cxlfbr	%f0, 16, %r0, 0
 	cxlfbr	%f2, 0, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cxlftr	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cxlftr	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cxlftr	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cxlftr	%f0, 16, %r0, 0
+#CHECK: error: invalid register pair
+#CHECK: cxlftr	%f2, 0, %r0, 0
+
+	cxlftr	%f0, 0, %r0, -1
+	cxlftr	%f0, 0, %r0, 16
+	cxlftr	%f0, -1, %r0, 0
+	cxlftr	%f0, 16, %r0, 0
+	cxlftr	%f2, 0, %r0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cxlgbr	%f0, 0, %r0, -1
 #CHECK: error: invalid operand
@@ -450,6 +711,63 @@
 	cxlgbr	%f0, 16, %r0, 0
 	cxlgbr	%f2, 0, %r0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cxlgtr	%f0, 0, %r0, -1
+#CHECK: error: invalid operand
+#CHECK: cxlgtr	%f0, 0, %r0, 16
+#CHECK: error: invalid operand
+#CHECK: cxlgtr	%f0, -1, %r0, 0
+#CHECK: error: invalid operand
+#CHECK: cxlgtr	%f0, 16, %r0, 0
+#CHECK: error: invalid register pair
+#CHECK: cxlgtr	%f2, 0, %r0, 0
+
+	cxlgtr	%f0, 0, %r0, -1
+	cxlgtr	%f0, 0, %r0, 16
+	cxlgtr	%f0, -1, %r0, 0
+	cxlgtr	%f0, 16, %r0, 0
+	cxlgtr	%f2, 0, %r0, 0
+
+#CHECK: error: instruction requires: dfp-zoned-conversion
+#CHECK: cxzt	%f0, 0(1), 0
+
+	cxzt	%f0, 0(1), 0
+
+#CHECK: error: instruction requires: dfp-zoned-conversion
+#CHECK: czdt	%f0, 0(1), 0
+
+	czdt	%f0, 0(1), 0
+
+#CHECK: error: instruction requires: dfp-zoned-conversion
+#CHECK: czxt	%f0, 0(1), 0
+
+	czxt	%f0, 0(1), 0
+
+#CHECK: error: invalid operand
+#CHECK: ddtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: ddtra	%f0, %f0, %f0, 16
+
+	ddtra	%f0, %f0, %f0, -1
+	ddtra	%f0, %f0, %f0, 16
+
+#CHECK: error: invalid operand
+#CHECK: dxtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: dxtra	%f0, %f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: dxtra	%f0, %f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: dxtra	%f0, %f2, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: dxtra	%f2, %f0, %f0, 0
+
+	dxtra	%f0, %f0, %f0, -1
+	dxtra	%f0, %f0, %f0, 16
+	dxtra	%f0, %f0, %f2, 0
+	dxtra	%f0, %f2, %f0, 0
+	dxtra	%f2, %f0, %f0, 0
+
 #CHECK: error: instruction requires: transactional-execution
 #CHECK: etnd	%r7
 
@@ -824,6 +1142,31 @@
 	lpdg	%r2, 0(%r1), -1(%r15)
 	lpdg	%r2, 0(%r1), 4096(%r15)
 
+#CHECK: error: invalid operand
+#CHECK: mdtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mdtra	%f0, %f0, %f0, 16
+
+	mdtra	%f0, %f0, %f0, -1
+	mdtra	%f0, %f0, %f0, 16
+
+#CHECK: error: invalid operand
+#CHECK: mxtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mxtra	%f0, %f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: mxtra	%f0, %f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: mxtra	%f0, %f2, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: mxtra	%f2, %f0, %f0, 0
+
+	mxtra	%f0, %f0, %f0, -1
+	mxtra	%f0, %f0, %f0, 16
+	mxtra	%f0, %f0, %f2, 0
+	mxtra	%f0, %f2, %f0, 0
+	mxtra	%f2, %f0, %f0, 0
+
 #CHECK: error: instruction requires: execution-hint
 #CHECK: niai	0, 0
 
@@ -884,6 +1227,14 @@
 	risblg	%r0,%r0,-1,0,0
 	risblg	%r0,%r0,256,0,0
 
+#CHECK: error: invalid operand
+#CHECK: sdtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: sdtra	%f0, %f0, %f0, 16
+
+	sdtra	%f0, %f0, %f0, -1
+	sdtra	%f0, %f0, %f0, 16
+
 #CHECK: error: invalid operand
 #CHECK: slak	%r0,%r0,-524289
 #CHECK: error: invalid operand
@@ -1009,6 +1360,23 @@
 	stocg	%r0,524288,1
 	stocg	%r0,0(%r1,%r2),1
 
+#CHECK: error: invalid operand
+#CHECK: sxtra	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: sxtra	%f0, %f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: sxtra	%f0, %f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: sxtra	%f0, %f2, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: sxtra	%f2, %f0, %f0, 0
+
+	sxtra	%f0, %f0, %f0, -1
+	sxtra	%f0, %f0, %f0, 16
+	sxtra	%f0, %f0, %f2, 0
+	sxtra	%f0, %f2, %f0, 0
+	sxtra	%f2, %f0, %f0, 0
+
 #CHECK: error: instruction requires: transactional-execution
 #CHECK: tabort	4095(%r1)
 
diff --git a/test/MC/SystemZ/insn-bad-zEC12.s b/test/MC/SystemZ/insn-bad-zEC12.s
index 4bc3be3292e4..80197a3c1ef1 100644
--- a/test/MC/SystemZ/insn-bad-zEC12.s
+++ b/test/MC/SystemZ/insn-bad-zEC12.s
@@ -62,6 +62,46 @@
 	bprp	0, 0, 1
 	bprp	0, 0, 0x1000000
 
+#CHECK: error: instruction requires: dfp-packed-conversion
+#CHECK: cdpt	%f0, 0(1), 0
+
+	cdpt	%f0, 0(1), 0
+
+#CHECK: error: invalid operand
+#CHECK: cdzt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: cdzt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: cdzt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: cdzt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdzt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdzt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdzt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cdzt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: cdzt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cdzt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: cdzt	%f0, 0(-), 0
+
+	cdzt	%f0, 0(1), -1
+	cdzt	%f0, 0(1), 16
+	cdzt	%f0, 0, 0
+	cdzt	%f0, 0(%r1), 0
+	cdzt	%f0, 0(0,%r1), 0
+	cdzt	%f0, 0(257,%r1), 0
+	cdzt	%f0, -1(1,%r1), 0
+	cdzt	%f0, 4096(1,%r1), 0
+	cdzt	%f0, 0(1,%r0), 0
+	cdzt	%f0, 0(%r1,%r2), 0
+	cdzt	%f0, 0(-), 0
+
 #CHECK: error: invalid operand
 #CHECK: clgt	%r0, -1, 0
 #CHECK: error: invalid operand
@@ -112,6 +152,132 @@
         cltno   %r0, 0
         clto    %r0, 0
 
+#CHECK: error: instruction requires: dfp-packed-conversion
+#CHECK: cpdt	%f0, 0(1), 0
+
+	cpdt	%f0, 0(1), 0
+
+#CHECK: error: instruction requires: dfp-packed-conversion
+#CHECK: cpxt	%f0, 0(1), 0
+
+	cpxt	%f0, 0(1), 0
+
+#CHECK: error: instruction requires: dfp-packed-conversion
+#CHECK: cxpt	%f0, 0(1), 0
+
+	cxpt	%f0, 0(1), 0
+
+#CHECK: error: invalid operand
+#CHECK: cxzt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: cxzt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: cxzt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: cxzt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxzt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxzt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxzt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: cxzt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: cxzt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: cxzt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: cxzt	%f0, 0(-), 0
+#CHECK: error: invalid register pair
+#CHECK: cxzt	%f15, 0(1), 0
+
+	cxzt	%f0, 0(1), -1
+	cxzt	%f0, 0(1), 16
+	cxzt	%f0, 0, 0
+	cxzt	%f0, 0(%r1), 0
+	cxzt	%f0, 0(0,%r1), 0
+	cxzt	%f0, 0(257,%r1), 0
+	cxzt	%f0, -1(1,%r1), 0
+	cxzt	%f0, 4096(1,%r1), 0
+	cxzt	%f0, 0(1,%r0), 0
+	cxzt	%f0, 0(%r1,%r2), 0
+	cxzt	%f0, 0(-), 0
+	cxzt	%f15, 0(1), 0
+
+#CHECK: error: invalid operand
+#CHECK: czdt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: czdt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: czdt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: czdt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czdt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czdt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czdt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czdt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: czdt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: czdt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: czdt	%f0, 0(-), 0
+
+	czdt	%f0, 0(1), -1
+	czdt	%f0, 0(1), 16
+	czdt	%f0, 0, 0
+	czdt	%f0, 0(%r1), 0
+	czdt	%f0, 0(0,%r1), 0
+	czdt	%f0, 0(257,%r1), 0
+	czdt	%f0, -1(1,%r1), 0
+	czdt	%f0, 4096(1,%r1), 0
+	czdt	%f0, 0(1,%r0), 0
+	czdt	%f0, 0(%r1,%r2), 0
+	czdt	%f0, 0(-), 0
+
+#CHECK: error: invalid operand
+#CHECK: czxt	%f0, 0(1), -1
+#CHECK: error: invalid operand
+#CHECK: czxt	%f0, 0(1), 16
+#CHECK: error: missing length in address
+#CHECK: czxt	%f0, 0, 0
+#CHECK: error: missing length in address
+#CHECK: czxt	%f0, 0(%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czxt	%f0, 0(0,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czxt	%f0, 0(257,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czxt	%f0, -1(1,%r1), 0
+#CHECK: error: invalid operand
+#CHECK: czxt	%f0, 4096(1,%r1), 0
+#CHECK: error: %r0 used in an address
+#CHECK: czxt	%f0, 0(1,%r0), 0
+#CHECK: error: invalid use of indexed addressing
+#CHECK: czxt	%f0, 0(%r1,%r2), 0
+#CHECK: error: unknown token in expression
+#CHECK: czxt	%f0, 0(-), 0
+#CHECK: error: invalid register pair
+#CHECK: czxt	%f15, 0(1), 0
+
+	czxt	%f0, 0(1), -1
+	czxt	%f0, 0(1), 16
+	czxt	%f0, 0, 0
+	czxt	%f0, 0(%r1), 0
+	czxt	%f0, 0(0,%r1), 0
+	czxt	%f0, 0(257,%r1), 0
+	czxt	%f0, -1(1,%r1), 0
+	czxt	%f0, 4096(1,%r1), 0
+	czxt	%f0, 0(1,%r0), 0
+	czxt	%f0, 0(%r1,%r2), 0
+	czxt	%f0, 0(-), 0
+	czxt	%f15, 0(1), 0
+
 #CHECK: error: invalid operand
 #CHECK: lat	%r0, -524289
 #CHECK: error: invalid operand
diff --git a/test/MC/SystemZ/insn-bad.s b/test/MC/SystemZ/insn-bad.s
index b96c661ae3da..259ad05e5f4a 100644
--- a/test/MC/SystemZ/insn-bad.s
+++ b/test/MC/SystemZ/insn-bad.s
@@ -12,6 +12,14 @@
 	a	%r0, -1
 	a	%r0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: ad	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: ad	%f0, 4096
+
+	ad	%f0, -1
+	ad	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: adb	%f0, -1
 #CHECK: error: invalid operand
@@ -20,6 +28,19 @@
 	adb	%f0, -1
 	adb	%f0, 4096
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: adtra	%f0, %f0, %f0, 0
+
+	adtra	%f0, %f0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: ae	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: ae	%f0, 4096
+
+	ae	%f0, -1
+	ae	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: aeb	%f0, -1
 #CHECK: error: invalid operand
@@ -328,6 +349,22 @@
 	asi	0, -129
 	asi	0, 128
 
+#CHECK: error: invalid operand
+#CHECK: au	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: au	%f0, 4096
+
+	au	%f0, -1
+	au	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: aw	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: aw	%f0, 4096
+
+	aw	%f0, -1
+	aw	%f0, 4096
+
 #CHECK: error: invalid register pair
 #CHECK: axbr	%f0, %f2
 #CHECK: error: invalid register pair
@@ -336,6 +373,29 @@
 	axbr	%f0, %f2
 	axbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: axr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: axr	%f2, %f0
+
+	axr	%f0, %f2
+	axr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: axtr	%f0, %f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: axtr	%f0, %f2, %f0
+#CHECK: error: invalid register pair
+#CHECK: axtr	%f2, %f0, %f0
+
+	axtr	%f0, %f0, %f2
+	axtr	%f0, %f2, %f0
+	axtr	%f2, %f0, %f0
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: axtra	%f0, %f0, %f0, 0
+
+	axtra	%f0, %f0, %f0, 0
 
 #CHECK: error: invalid operand
 #CHECK: ay	%r0, -524289
@@ -612,6 +672,14 @@
 	c	%r0, -1
 	c	%r0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: cd	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: cd	%f0, 4096
+
+	cd	%f0, -1
+	cd	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: cdb	%f0, -1
 #CHECK: error: invalid operand
@@ -625,21 +693,41 @@
 
 	cdfbra	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cdftr	%f0, 0, %r0, 0
+
+	cdftr	%f0, 0, %r0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: cdgbra	%f0, 0, %r0, 0
 
 	cdgbra	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cdgtra	%f0, 0, %r0, 0
+
+	cdgtra	%f0, 0, %r0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: cdlfbr	%f0, 0, %r0, 0
 
 	cdlfbr	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cdlftr	%f0, 0, %r0, 0
+
+	cdlftr	%f0, 0, %r0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: cdlgbr	%f0, 0, %r0, 0
 
 	cdlgbr	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cdlgtr	%f0, 0, %r0, 0
+
+	cdlgtr	%f0, 0, %r0, 0
+
 #CHECK: error: invalid register pair
 #CHECK: cds	%r1, %r0, 0
 #CHECK: error: invalid register pair
@@ -691,6 +779,14 @@
 	cdsy	%r0, %r0, 524288
 	cdsy	%r0, %r0, 0(%r1,%r2)
 
+#CHECK: error: invalid operand
+#CHECK: ce	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: ce	%f0, 4096
+
+	ce	%f0, -1
+	ce	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: ceb	%f0, -1
 #CHECK: error: invalid operand
@@ -719,6 +815,14 @@
 
 	celgbr	%f0, 0, %r0, 0
 
+#CHECK: error: invalid register pair
+#CHECK: cextr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: cextr	%f2, %f0
+
+	cextr	%f0, %f2
+	cextr	%f2, %f0
+
 #CHECK: error: invalid operand
 #CHECK: cfc	-1
 #CHECK: error: invalid operand
@@ -743,6 +847,11 @@
 
 	cfdbra	%r0, 0, %f0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cfdtr	%r0, 0, %f0, 0
+
+	cfdtr	%r0, 0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cfebr	%r0, -1, %f0
 #CHECK: error: invalid operand
@@ -780,6 +889,22 @@
 
 	cfxbra	%r0, 0, %f0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cfxtr	%r0, 0, %f0, 0
+
+	cfxtr	%r0, 0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: cfxr	%r0, -1, %f0
+#CHECK: error: invalid operand
+#CHECK: cfxr	%r0, 16, %f0
+#CHECK: error: invalid register pair
+#CHECK: cfxr	%r0, 0, %f2
+
+	cfxr	%r0, -1, %f0
+	cfxr	%r0, 16, %f0
+	cfxr	%r0, 0, %f2
+
 #CHECK: error: invalid operand
 #CHECK: cg	%r0, -524289
 #CHECK: error: invalid operand
@@ -801,6 +926,19 @@
 
 	cgdbra	%r0, 0, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cgdtr	%r0, -1, %f0
+#CHECK: error: invalid operand
+#CHECK: cgdtr	%r0, 16, %f0
+
+	cgdtr	%r0, -1, %f0
+	cgdtr	%r0, 16, %f0
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cgdtra	%r0, 0, %f0, 0
+
+	cgdtra	%r0, 0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: cgebr	%r0, -1, %f0
 #CHECK: error: invalid operand
@@ -998,6 +1136,33 @@
 
 	cgxbra	%r0, 0, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: cgxtr	%r0, -1, %f0
+#CHECK: error: invalid operand
+#CHECK: cgxtr	%r0, 16, %f0
+#CHECK: error: invalid register pair
+#CHECK: cgxtr	%r0, 0, %f2
+
+	cgxtr	%r0, -1, %f0
+	cgxtr	%r0, 16, %f0
+	cgxtr	%r0, 0, %f2
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cgxtra	%r0, 0, %f0, 0
+
+	cgxtra	%r0, 0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: cgxr	%r0, -1, %f0
+#CHECK: error: invalid operand
+#CHECK: cgxr	%r0, 16, %f0
+#CHECK: error: invalid register pair
+#CHECK: cgxr	%r0, 0, %f2
+
+	cgxr	%r0, -1, %f0
+	cgxr	%r0, 16, %f0
+	cgxr	%r0, 0, %f2
+
 #CHECK: error: invalid operand
 #CHECK: ch	%r0, -1
 #CHECK: error: invalid operand
@@ -1225,6 +1390,11 @@
 
 	clfdbr	%r0, 0, %f0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: clfdtr	%r0, 0, %f0, 0
+
+	clfdtr	%r0, 0, %f0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: clfebr	%r0, 0, %f0, 0
 
@@ -1274,6 +1444,11 @@
 
 	clfxbr	%r0, 0, %f0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: clfxtr	%r0, 0, %f0, 0
+
+	clfxtr	%r0, 0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: clg	%r0, -524289
 #CHECK: error: invalid operand
@@ -1287,6 +1462,11 @@
 
 	clgdbr	%r0, 0, %f0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: clgdtr	%r0, 0, %f0, 0
+
+	clgdtr	%r0, 0, %f0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: clgebr	%r0, 0, %f0, 0
 
@@ -1438,6 +1618,11 @@
 
 	clgxbr	%r0, 0, %f0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: clgxtr	%r0, 0, %f0, 0
+
+	clgxtr	%r0, 0, %f0, 0
+
 #CHECK: error: instruction requires: high-word
 #CHECK: clhf	%r0, 0
 
@@ -1753,6 +1938,14 @@
 	cs	%r0, %r0, 4096
 	cs	%r0, %r0, 0(%r1,%r2)
 
+#CHECK: error: invalid operand
+#CHECK: csdtr	%r0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: csdtr	%r0, %f0, 16
+
+	csdtr	%r0, %f0, -1
+	csdtr	%r0, %f0, 16
+
 #CHECK: error: invalid operand
 #CHECK: csg	%r0, %r0, -524289
 #CHECK: error: invalid operand
@@ -1781,6 +1974,20 @@
         csst	0(%r1), -1(%r15), %r2
         csst	0(%r1), 4096(%r15), %r2
 
+#CHECK: error: invalid operand
+#CHECK: csxtr	%r0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: csxtr	%r0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: csxtr	%r0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: csxtr	%r1, %f0, 0
+
+	csxtr	%r0, %f0, -1
+	csxtr	%r0, %f0, 16
+	csxtr	%r0, %f2, 0
+	csxtr	%r1, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: csy	%r0, %r0, -524289
 #CHECK: error: invalid operand
@@ -1900,6 +2107,14 @@
 	cuutf	%r2, %r4, -1
 	cuutf	%r2, %r4, 16
 
+#CHECK: error: invalid register pair
+#CHECK: cuxtr	%r0, %f2
+#CHECK: error: invalid register pair
+#CHECK: cuxtr	%r1, %f0
+
+	cuxtr	%r0, %f2
+	cuxtr	%r1, %f0
+
 #CHECK: error: invalid operand
 #CHECK: cvb	%r0, -1
 #CHECK: error: invalid operand
@@ -1966,6 +2181,16 @@
 
 	cxfbra	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cxftr	%f0, 0, %r0, 0
+
+	cxftr	%f0, 0, %r0, 0
+
+#CHECK: error: invalid register pair
+#CHECK: cxfr	%f2, %r0
+
+	cxfr	%f2, %r0
+
 #CHECK: error: invalid register pair
 #CHECK: cxgbr	%f2, %r0
 
@@ -1976,16 +2201,73 @@
 
 	cxgbra	%f0, 0, %r0, 0
 
+#CHECK: error: invalid register pair
+#CHECK: cxgr	%f2, %r0
+
+	cxgr	%f2, %r0
+
+#CHECK: error: invalid register pair
+#CHECK: cxgtr	%f2, %r0
+
+	cxgtr	%f2, %r0
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cxgtra	%f0, 0, %r0, 0
+
+	cxgtra	%f0, 0, %r0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: cxlfbr	%f0, 0, %r0, 0
 
 	cxlfbr	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cxlftr	%f0, 0, %r0, 0
+
+	cxlftr	%f0, 0, %r0, 0
+
 #CHECK: error: instruction requires: fp-extension
 #CHECK: cxlgbr	%f0, 0, %r0, 0
 
 	cxlgbr	%f0, 0, %r0, 0
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: cxlgtr	%f0, 0, %r0, 0
+
+	cxlgtr	%f0, 0, %r0, 0
+
+#CHECK: error: invalid register pair
+#CHECK: cxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: cxr	%f2, %f0
+
+	cxr	%f0, %f2
+	cxr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: cxstr	%f0, %r1
+#CHECK: error: invalid register pair
+#CHECK: cxstr	%f2, %r0
+
+	cxstr	%f0, %r1
+	cxstr	%f2, %r0
+
+#CHECK: error: invalid register pair
+#CHECK: cxtr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: cxtr	%f2, %f0
+
+	cxtr	%f0, %f2
+	cxtr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: cxutr	%f0, %r1
+#CHECK: error: invalid register pair
+#CHECK: cxutr	%f2, %r0
+
+	cxutr	%f0, %r1
+	cxutr	%f2, %r0
+
 #CHECK: error: invalid operand
 #CHECK: cy	%r0, -524289
 #CHECK: error: invalid operand
@@ -2005,6 +2287,14 @@
 	d	%r0, 4096
 	d	%r1, 0
 
+#CHECK: error: invalid operand
+#CHECK: dd	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: dd	%f0, 4096
+
+	dd	%f0, -1
+	dd	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: ddb	%f0, -1
 #CHECK: error: invalid operand
@@ -2013,6 +2303,19 @@
 	ddb	%f0, -1
 	ddb	%f0, 4096
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: ddtra	%f0, %f0, %f0, 0
+
+	ddtra	%f0, %f0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: de	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: de	%f0, 4096
+
+	de	%f0, -1
+	de	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: deb	%f0, -1
 #CHECK: error: invalid operand
@@ -2167,6 +2470,30 @@
 	dxbr	%f0, %f2
 	dxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: dxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: dxr	%f2, %f0
+
+	dxr	%f0, %f2
+	dxr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: dxtr	%f0, %f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: dxtr	%f0, %f2, %f0
+#CHECK: error: invalid register pair
+#CHECK: dxtr	%f2, %f0, %f0
+
+	dxtr	%f0, %f0, %f2
+	dxtr	%f0, %f2, %f0
+	dxtr	%f2, %f0, %f0
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: dxtra	%f0, %f0, %f0, 0
+
+	dxtra	%f0, %f0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: ecag	%r0, %r0, -524289
 #CHECK: error: invalid operand
@@ -2283,6 +2610,22 @@
 	edmk	0(1,%r2), 0(%r1,%r2)
 	edmk	0(-), 0
 
+#CHECK: error: invalid register pair
+#CHECK: eextr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: eextr	%f2, %f0
+
+	eextr	%f0, %f2
+	eextr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: esxtr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: esxtr	%f2, %f0
+
+	esxtr	%f0, %f2
+	esxtr	%f2, %f0
+
 #CHECK: error: invalid operand
 #CHECK: ex      %r0, -1
 #CHECK: error: invalid operand
@@ -2304,6 +2647,20 @@
 
 	fidbra	%f0, 0, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: fidtr	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: fidtr	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: fidtr	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: fidtr	%f0, 16, %f0, 0
+
+	fidtr	%f0, 0, %f0, -1
+	fidtr	%f0, 0, %f0, 16
+	fidtr	%f0, -1, %f0, 0
+	fidtr	%f0, 16, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: fiebr	%f0, -1, %f0
 #CHECK: error: invalid operand
@@ -2336,6 +2693,34 @@
 
 	fixbra	%f0, 0, %f0, 0
 
+#CHECK: error: invalid register pair
+#CHECK: fixr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: fixr	%f2, %f0
+
+	fixr	%f0, %f2
+	fixr	%f2, %f0
+
+#CHECK: error: invalid operand
+#CHECK: fixtr	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: fixtr	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: fixtr	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: fixtr	%f0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: fixtr	%f0, 0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: fixtr	%f2, 0, %f0, 0
+
+	fixtr	%f0, 0, %f0, -1
+	fixtr	%f0, 0, %f0, 16
+	fixtr	%f0, -1, %f0, 0
+	fixtr	%f0, 16, %f0, 0
+	fixtr	%f0, 0, %f2, 0
+	fixtr	%f2, 0, %f0, 0
+
 #CHECK: error: invalid register pair
 #CHECK: flogr	%r1, %r0
 
@@ -2399,6 +2784,17 @@
 	icy	%r0, -524289
 	icy	%r0, 524288
 
+#CHECK: error: invalid register pair
+#CHECK: iextr	%f0, %f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: iextr	%f0, %f2, %f0
+#CHECK: error: invalid register pair
+#CHECK: iextr	%f2, %f0, %f0
+
+	iextr	%f0, %f0, %f2
+	iextr	%f0, %f2, %f0
+	iextr	%f2, %f0, %f0
+
 #CHECK: error: invalid operand
 #CHECK: iihf	%r0, -1
 #CHECK: error: invalid operand
@@ -2517,6 +2913,14 @@
 	kxbr	%f0, %f2
 	kxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: kxtr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: kxtr	%f2, %f0
+
+	kxtr	%f0, %f2
+	kxtr	%f2, %f0
+
 #CHECK: error: invalid operand
 #CHECK: l	%r0, -1
 #CHECK: error: invalid operand
@@ -2651,6 +3055,14 @@
 	lcxbr	%f0, %f2
 	lcxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: lcxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: lcxr	%f2, %f0
+
+	lcxr	%f0, %f2
+	lcxr	%f2, %f0
+
 #CHECK: error: invalid operand
 #CHECK: ld	%f0, -1
 #CHECK: error: invalid operand
@@ -2667,6 +3079,14 @@
 	ldeb	%f0, -1
 	ldeb	%f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: ldetr	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: ldetr	%f0, %f0, 16
+
+	ldetr	%f0, %f0, -1
+	ldetr	%f0, %f0, 16
+
 #CHECK: error: invalid register pair
 #CHECK: ldxbr	%f0, %f2
 #CHECK: error: invalid register pair
@@ -2680,6 +3100,31 @@
 
 	ldxbra	%f0, 0, %f0, 0
 
+#CHECK: error: invalid register pair
+#CHECK: ldxr	%f0, %f2
+
+	ldxr	%f0, %f2
+
+#CHECK: error: invalid operand
+#CHECK: ldxtr	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: ldxtr	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: ldxtr	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: ldxtr	%f0, 16, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: ldxtr	%f0, 0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: ldxtr	%f2, 0, %f0, 0
+
+	ldxtr	%f0, 0, %f0, -1
+	ldxtr	%f0, 0, %f0, 16
+	ldxtr	%f0, -1, %f0, 0
+	ldxtr	%f0, 16, %f0, 0
+	ldxtr	%f0, 0, %f2, 0
+	ldxtr	%f2, 0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: ldy	%f0, -524289
 #CHECK: error: invalid operand
@@ -2701,6 +3146,20 @@
 
 	ledbra	%f0, 0, %f0, 0
 
+#CHECK: error: invalid operand
+#CHECK: ledtr	%f0, 0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: ledtr	%f0, 0, %f0, 16
+#CHECK: error: invalid operand
+#CHECK: ledtr	%f0, -1, %f0, 0
+#CHECK: error: invalid operand
+#CHECK: ledtr	%f0, 16, %f0, 0
+
+	ledtr	%f0, 0, %f0, -1
+	ledtr	%f0, 0, %f0, 16
+	ledtr	%f0, -1, %f0, 0
+	ledtr	%f0, 16, %f0, 0
+
 #CHECK: error: invalid register pair
 #CHECK: lexbr	%f0, %f2
 #CHECK: error: invalid register pair
@@ -2714,6 +3173,11 @@
 
 	lexbra	%f0, 0, %f0, 0
 
+#CHECK: error: invalid register pair
+#CHECK: lexr	%f0, %f2
+
+	lexr	%f0, %f2
+
 #CHECK: error: invalid operand
 #CHECK: ley	%f0, -524289
 #CHECK: error: invalid operand
@@ -3102,6 +3566,14 @@
 	lnxbr	%f0, %f2
 	lnxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: lnxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: lnxr	%f2, %f0
+
+	lnxr	%f0, %f2
+	lnxr	%f2, %f0
+
 #CHECK: error: instruction requires: interlocked-access1
 #CHECK: lpd	%r0, 0, 0
 	lpd	%r0, 0, 0
@@ -3129,6 +3601,19 @@
 	lpxbr	%f0, %f2
 	lpxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: lpxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: lpxr	%f2, %f0
+
+	lpxr	%f0, %f2
+	lpxr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: lrdr	%f0, %f2
+
+	lrdr	%f0, %f2
+
 #CHECK: error: offset out of range
 #CHECK: lrl	%r0, -0x1000000002
 #CHECK: error: offset out of range
@@ -3191,6 +3676,97 @@
 	ltxbr	%f0, %f14
 	ltxbr	%f14, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: ltxr	%f0, %f14
+#CHECK: error: invalid register pair
+#CHECK: ltxr	%f14, %f0
+
+	ltxr	%f0, %f14
+	ltxr	%f14, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: ltxtr	%f0, %f14
+#CHECK: error: invalid register pair
+#CHECK: ltxtr	%f14, %f0
+
+	ltxtr	%f0, %f14
+	ltxtr	%f14, %f0
+
+#CHECK: error: invalid operand
+#CHECK: lxd	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: lxd	%f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: lxd	%f2, 0
+
+	lxd	%f0, -1
+	lxd	%f0, 4096
+	lxd	%f2, 0
+
+#CHECK: error: invalid operand
+#CHECK: lxdb	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: lxdb	%f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: lxdb	%f2, 0
+
+	lxdb	%f0, -1
+	lxdb	%f0, 4096
+	lxdb	%f2, 0
+
+#CHECK: error: invalid register pair
+#CHECK: lxdbr	%f2, %f0
+
+	lxdbr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: lxdr	%f2, %f0
+
+	lxdr	%f2, %f0
+
+#CHECK: error: invalid operand
+#CHECK: lxdtr	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: lxdtr	%f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: lxdtr	%f2, %f0, 0
+
+	lxdtr	%f0, %f0, -1
+	lxdtr	%f0, %f0, 16
+	lxdtr	%f2, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: lxe	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: lxe	%f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: lxe	%f2, 0
+
+	lxe	%f0, -1
+	lxe	%f0, 4096
+	lxe	%f2, 0
+
+#CHECK: error: invalid operand
+#CHECK: lxeb	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: lxeb	%f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: lxeb	%f2, 0
+
+	lxeb	%f0, -1
+	lxeb	%f0, 4096
+	lxeb	%f2, 0
+
+#CHECK: error: invalid register pair
+#CHECK: lxebr	%f2, %f0
+
+	lxebr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: lxer	%f2, %f0
+
+	lxer	%f2, %f0
+
 #CHECK: error: invalid register pair
 #CHECK: lxr	%f0, %f2
 #CHECK: error: invalid register pair
@@ -3223,6 +3799,14 @@
 	m	%r0, 4096
 	m	%r1, 0
 
+#CHECK: error: invalid operand
+#CHECK: mad	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mad	%f0, %f0, 4096
+
+	mad	%f0, %f0, -1
+	mad	%f0, %f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: madb	%f0, %f0, -1
 #CHECK: error: invalid operand
@@ -3231,6 +3815,14 @@
 	madb	%f0, %f0, -1
 	madb	%f0, %f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: mae	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mae	%f0, %f0, 4096
+
+	mae	%f0, %f0, -1
+	mae	%f0, %f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: maeb	%f0, %f0, -1
 #CHECK: error: invalid operand
@@ -3239,6 +3831,38 @@
 	maeb	%f0, %f0, -1
 	maeb	%f0, %f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: may	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: may	%f0, %f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: may	%f2, %f0, 0
+
+	may	%f0, %f0, -1
+	may	%f0, %f0, 4096
+	may	%f2, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: mayh	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mayh	%f0, %f0, 4096
+
+	mayh	%f0, %f0, -1
+	mayh	%f0, %f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: mayl	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mayl	%f0, %f0, 4096
+
+	mayl	%f0, %f0, -1
+	mayl	%f0, %f0, 4096
+
+#CHECK: error: invalid register pair
+#CHECK: mayr	%f2, %f0, %f0
+
+	mayr	%f2, %f0, %f0
+
 #CHECK: error: invalid operand
 #CHECK: mc	-1, 0
 #CHECK: error: invalid operand
@@ -3256,6 +3880,14 @@
 	mc	0, -1
 	mc	0, 256
 
+#CHECK: error: invalid operand
+#CHECK: md	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: md	%f0, 4096
+
+	md	%f0, -1
+	md	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: mdb	%f0, -1
 #CHECK: error: invalid operand
@@ -3264,6 +3896,14 @@
 	mdb	%f0, -1
 	mdb	%f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: mde	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: mde	%f0, 4096
+
+	mde	%f0, -1
+	mde	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: mdeb	%f0, -1
 #CHECK: error: invalid operand
@@ -3272,6 +3912,27 @@
 	mdeb	%f0, -1
 	mdeb	%f0, 4096
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: mdtra	%f0, %f0, %f0, 0
+
+	mdtra	%f0, %f0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: me	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: me	%f0, 4096
+
+	me	%f0, -1
+	me	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: mee	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: mee	%f0, 4096
+
+	mee	%f0, -1
+	mee	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: meeb	%f0, -1
 #CHECK: error: invalid operand
@@ -3427,6 +4088,14 @@
 	ms	%r0, -1
 	ms	%r0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: msd	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: msd	%f0, %f0, 4096
+
+	msd	%f0, %f0, -1
+	msd	%f0, %f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: msdb	%f0, %f0, -1
 #CHECK: error: invalid operand
@@ -3435,6 +4104,14 @@
 	msdb	%f0, %f0, -1
 	msdb	%f0, %f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: mse	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: mse	%f0, %f0, 4096
+
+	mse	%f0, %f0, -1
+	mse	%f0, %f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: mseb	%f0, %f0, -1
 #CHECK: error: invalid operand
@@ -3870,6 +4547,17 @@
 	mxbr	%f0, %f2
 	mxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: mxd	%f2, 0
+#CHECK: error: invalid operand
+#CHECK: mxd	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: mxd	%f0, 4096
+
+	mxd	%f2, 0
+	mxd	%f0, -1
+	mxd	%f0, 4096
+
 #CHECK: error: invalid register pair
 #CHECK: mxdb	%f2, 0
 #CHECK: error: invalid operand
@@ -3886,6 +4574,67 @@
 
 	mxdbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: mxdr	%f2, %f0
+
+	mxdr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: mxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: mxr	%f2, %f0
+
+	mxr	%f0, %f2
+	mxr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: mxtr	%f0, %f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: mxtr	%f0, %f2, %f0
+#CHECK: error: invalid register pair
+#CHECK: mxtr	%f2, %f0, %f0
+
+	mxtr	%f0, %f0, %f2
+	mxtr	%f0, %f2, %f0
+	mxtr	%f2, %f0, %f0
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: mxtra	%f0, %f0, %f0, 0
+
+	mxtra	%f0, %f0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: my	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: my	%f0, %f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: my	%f2, %f0, 0
+
+	my	%f0, %f0, -1
+	my	%f0, %f0, 4096
+	my	%f2, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: myh	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: myh	%f0, %f0, 4096
+
+	myh	%f0, %f0, -1
+	myh	%f0, %f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: myl	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: myl	%f0, %f0, 4096
+
+	myl	%f0, %f0, -1
+	myl	%f0, %f0, 4096
+
+#CHECK: error: invalid register pair
+#CHECK: myr	%f2, %f0, %f0
+
+	myr	%f2, %f0, %f0
+
 #CHECK: error: invalid operand
 #CHECK: n	%r0, -1
 #CHECK: error: invalid operand
@@ -4412,6 +5161,31 @@
 #CHECK: pr    %r0
         pr    %r0
 
+#CHECK: error: invalid operand
+#CHECK: qadtr	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: qadtr	%f0, %f0, %f0, 16
+
+	qadtr	%f0, %f0, %f0, -1
+	qadtr	%f0, %f0, %f0, 16
+
+#CHECK: error: invalid operand
+#CHECK: qaxtr	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: qaxtr	%f0, %f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: qaxtr	%f0, %f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: qaxtr	%f0, %f2, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: qaxtr	%f2, %f0, %f0, 0
+
+	qaxtr	%f0, %f0, %f0, -1
+	qaxtr	%f0, %f0, %f0, 16
+	qaxtr	%f0, %f0, %f2, 0
+	qaxtr	%f0, %f2, %f0, 0
+	qaxtr	%f2, %f0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: risbg	%r0,%r0,0,0,-1
 #CHECK: error: invalid operand
@@ -4510,6 +5284,31 @@
 	rosbg	%r0,%r0,-1,0,0
 	rosbg	%r0,%r0,256,0,0
 
+#CHECK: error: invalid operand
+#CHECK: rrdtr	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: rrdtr	%f0, %f0, %f0, 16
+
+	rrdtr	%f0, %f0, %f0, -1
+	rrdtr	%f0, %f0, %f0, 16
+
+#CHECK: error: invalid operand
+#CHECK: rrxtr	%f0, %f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: rrxtr	%f0, %f0, %f0, 16
+#CHECK: error: invalid register pair
+#CHECK: rrxtr	%f0, %f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: rrxtr	%f0, %f2, %f0, 0
+#CHECK: error: invalid register pair
+#CHECK: rrxtr	%f2, %f0, %f0, 0
+
+	rrxtr	%f0, %f0, %f0, -1
+	rrxtr	%f0, %f0, %f0, 16
+	rrxtr	%f0, %f0, %f2, 0
+	rrxtr	%f0, %f2, %f0, 0
+	rrxtr	%f2, %f0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: rxsbg	%r0,%r0,0,0,-1
 #CHECK: error: invalid operand
@@ -4538,6 +5337,14 @@
 	s	%r0, -1
 	s	%r0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: sd	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: sd	%f0, 4096
+
+	sd	%f0, -1
+	sd	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: sdb	%f0, -1
 #CHECK: error: invalid operand
@@ -4546,6 +5353,19 @@
 	sdb	%f0, -1
 	sdb	%f0, 4096
 
+#CHECK: error: instruction requires: fp-extension
+#CHECK: sdtra	%f0, %f0, %f0, 0
+
+	sdtra	%f0, %f0, %f0, 0
+
+#CHECK: error: invalid operand
+#CHECK: se	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: se	%f0, 4096
+
+	se	%f0, -1
+	se	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: seb	%f0, -1
 #CHECK: error: invalid operand
@@ -4682,6 +5502,14 @@
 	sldl	%r0,0(%r0)
 	sldl	%r0,0(%r1,%r2)
 
+#CHECK: error: invalid operand
+#CHECK: sldt	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: sldt	%f0, %f0, 4096
+
+	sldt	%f0, %f0, -1
+	sldt	%f0, %f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: slfi	%r0, -1
 #CHECK: error: invalid operand
@@ -4757,6 +5585,20 @@
 
 	slrk	%r2,%r3,%r4
 
+#CHECK: error: invalid operand
+#CHECK: slxt	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: slxt	%f0, %f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: slxt	%f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: slxt	%f2, %f0, 0
+
+	slxt	%f0, %f0, -1
+	slxt	%f0, %f0, 4096
+	slxt	%f0, %f2, 0
+	slxt	%f2, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: sly	%r0, -524289
 #CHECK: error: invalid operand
@@ -4818,6 +5660,14 @@
 	sp	0(1,%r2), 0(%r1,%r2)
 	sp	0(-), 0(1)
 
+#CHECK: error: invalid operand
+#CHECK: sqd	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: sqd	%f0, 4096
+
+	sqd	%f0, -1
+	sqd	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: sqdb	%f0, -1
 #CHECK: error: invalid operand
@@ -4826,6 +5676,14 @@
 	sqdb	%f0, -1
 	sqdb	%f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: sqe	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: sqe	%f0, 4096
+
+	sqe	%f0, -1
+	sqe	%f0, 4096
+
 #CHECK: error: invalid operand
 #CHECK: sqeb	%f0, -1
 #CHECK: error: invalid operand
@@ -4842,6 +5700,14 @@
 	sqxbr	%f0, %f2
 	sqxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: sqxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: sqxr	%f2, %f0
+
+	sqxr	%f0, %f2
+	sqxr	%f2, %f0
+
 #CHECK: error: invalid operand
 #CHECK: sra	%r0,-1
 #CHECK: error: invalid operand
@@ -4909,6 +5775,14 @@
 	srdl	%r0,0(%r0)
 	srdl	%r0,0(%r1,%r2)
 
+#CHECK: error: invalid operand
+#CHECK: srdt	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: srdt	%f0, %f0, 4096
+
+	srdt	%f0, %f0, -1
+	srdt	%f0, %f0, 4096
+
 #CHECK: error: instruction requires: distinct-ops
 #CHECK: srk	%r2,%r3,%r4
 
@@ -5024,6 +5898,20 @@
 	srp	0(1), 0, 16
 	srp	0(-), 0, 0
 
+#CHECK: error: invalid operand
+#CHECK: srxt	%f0, %f0, -1
+#CHECK: error: invalid operand
+#CHECK: srxt	%f0, %f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: srxt	%f0, %f2, 0
+#CHECK: error: invalid register pair
+#CHECK: srxt	%f2, %f0, 0
+
+	srxt	%f0, %f0, -1
+	srxt	%f0, %f0, 4096
+	srxt	%f0, %f2, 0
+	srxt	%f2, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: st	%r0, -1
 #CHECK: error: invalid operand
@@ -5326,6 +6214,22 @@
 	sty	%r0, -524289
 	sty	%r0, 524288
 
+#CHECK: error: invalid operand
+#CHECK: su	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: su	%f0, 4096
+
+	su	%f0, -1
+	su	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: sw	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: sw	%f0, 4096
+
+	sw	%f0, -1
+	sw	%f0, 4096
+
 #CHECK: error: invalid register pair
 #CHECK: sxbr	%f0, %f2
 #CHECK: error: invalid register pair
@@ -5334,6 +6238,30 @@
 	sxbr	%f0, %f2
 	sxbr	%f2, %f0
 
+#CHECK: error: invalid register pair
+#CHECK: sxr	%f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: sxr	%f2, %f0
+
+	sxr	%f0, %f2
+	sxr	%f2, %f0
+
+#CHECK: error: invalid register pair
+#CHECK: sxtr	%f0, %f0, %f2
+#CHECK: error: invalid register pair
+#CHECK: sxtr	%f0, %f2, %f0
+#CHECK: error: invalid register pair
+#CHECK: sxtr	%f2, %f0, %f0
+
+	sxtr	%f0, %f0, %f2
+	sxtr	%f0, %f2, %f0
+	sxtr	%f2, %f0, %f0
+
+#CHECK: error: instruction requires: fp-extension
+#CHECK: sxtra	%f0, %f0, %f0, 0
+
+	sxtra	%f0, %f0, %f0, 0
+
 #CHECK: error: invalid operand
 #CHECK: sy	%r0, -524289
 #CHECK: error: invalid operand
@@ -5342,6 +6270,22 @@
 	sy	%r0, -524289
 	sy	%r0, 524288
 
+#CHECK: error: invalid operand
+#CHECK: tbdr	%f0, -1, %f0
+#CHECK: error: invalid operand
+#CHECK: tbdr	%f0, 16, %f0
+
+	tbdr	%f0, -1, %f0
+	tbdr	%f0, 16, %f0
+
+#CHECK: error: invalid operand
+#CHECK: tbedr	%f0, -1, %f0
+#CHECK: error: invalid operand
+#CHECK: tbedr	%f0, 16, %f0
+
+	tbedr	%f0, -1, %f0
+	tbedr	%f0, 16, %f0
+
 #CHECK: error: invalid operand
 #CHECK: tcdb	%f0, -1
 #CHECK: error: invalid operand
@@ -5366,6 +6310,60 @@
 	tcxb	%f0, -1
 	tcxb	%f0, 4096
 
+#CHECK: error: invalid operand
+#CHECK: tdcdt	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: tdcdt	%f0, 4096
+
+	tdcdt	%f0, -1
+	tdcdt	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: tdcet	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: tdcet	%f0, 4096
+
+	tdcet	%f0, -1
+	tdcet	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: tdcxt	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: tdcxt	%f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: tdcxt	%f2, 0
+
+	tdcxt	%f0, -1
+	tdcxt	%f0, 4096
+	tdcxt	%f2, 0
+
+#CHECK: error: invalid operand
+#CHECK: tdgdt	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: tdgdt	%f0, 4096
+
+	tdgdt	%f0, -1
+	tdgdt	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: tdget	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: tdget	%f0, 4096
+
+	tdget	%f0, -1
+	tdget	%f0, 4096
+
+#CHECK: error: invalid operand
+#CHECK: tdgxt	%f0, -1
+#CHECK: error: invalid operand
+#CHECK: tdgxt	%f0, 4096
+#CHECK: error: invalid register pair
+#CHECK: tdgxt	%f2, 0
+
+	tdgxt	%f0, -1
+	tdgxt	%f0, 4096
+	tdgxt	%f2, 0
+
 #CHECK: error: invalid operand
 #CHECK: tm	-1, 0
 #CHECK: error: invalid operand
diff --git a/test/MC/SystemZ/insn-good-z13.s b/test/MC/SystemZ/insn-good-z13.s
index cbfcfa9a89af..6a4beff7638c 100644
--- a/test/MC/SystemZ/insn-good-z13.s
+++ b/test/MC/SystemZ/insn-good-z13.s
@@ -4,6 +4,86 @@
 # RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch11 -show-encoding %s \
 # RUN:   | FileCheck %s
 
+#CHECK: cdpt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xae]
+#CHECK: cdpt	%f15, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xf0,0xae]
+#CHECK: cdpt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xae]
+#CHECK: cdpt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xae]
+#CHECK: cdpt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xae]
+#CHECK: cdpt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xae]
+#CHECK: cdpt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xae]
+#CHECK: cdpt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xae]
+#CHECK: cdpt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xae]
+
+	cdpt	%f0, 0(1), 0
+	cdpt	%f15, 0(1), 0
+	cdpt	%f0, 0(1), 15
+	cdpt	%f0, 0(1,%r1), 0
+	cdpt	%f0, 0(1,%r15), 0
+	cdpt	%f0, 4095(1,%r1), 0
+	cdpt	%f0, 4095(1,%r15), 0
+	cdpt	%f0, 0(256,%r1), 0
+	cdpt	%f0, 0(256,%r15), 0
+
+#CHECK: cpdt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xac]
+#CHECK: cpdt	%f15, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xf0,0xac]
+#CHECK: cpdt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xac]
+#CHECK: cpdt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xac]
+#CHECK: cpdt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xac]
+#CHECK: cpdt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xac]
+#CHECK: cpdt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xac]
+#CHECK: cpdt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xac]
+#CHECK: cpdt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xac]
+
+	cpdt	%f0, 0(1), 0
+	cpdt	%f15, 0(1), 0
+	cpdt	%f0, 0(1), 15
+	cpdt	%f0, 0(1,%r1), 0
+	cpdt	%f0, 0(1,%r15), 0
+	cpdt	%f0, 4095(1,%r1), 0
+	cpdt	%f0, 4095(1,%r15), 0
+	cpdt	%f0, 0(256,%r1), 0
+	cpdt	%f0, 0(256,%r15), 0
+
+#CHECK: cpxt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xad]
+#CHECK: cpxt	%f13, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xd0,0xad]
+#CHECK: cpxt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xad]
+#CHECK: cpxt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xad]
+#CHECK: cpxt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xad]
+#CHECK: cpxt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xad]
+#CHECK: cpxt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xad]
+#CHECK: cpxt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xad]
+#CHECK: cpxt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xad]
+
+	cpxt	%f0, 0(1), 0
+	cpxt	%f13, 0(1), 0
+	cpxt	%f0, 0(1), 15
+	cpxt	%f0, 0(1,%r1), 0
+	cpxt	%f0, 0(1,%r15), 0
+	cpxt	%f0, 4095(1,%r1), 0
+	cpxt	%f0, 4095(1,%r15), 0
+	cpxt	%f0, 0(256,%r1), 0
+	cpxt	%f0, 0(256,%r15), 0
+
+#CHECK: cxpt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xaf]
+#CHECK: cxpt	%f13, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xd0,0xaf]
+#CHECK: cxpt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xaf]
+#CHECK: cxpt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xaf]
+#CHECK: cxpt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xaf]
+#CHECK: cxpt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xaf]
+#CHECK: cxpt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xaf]
+#CHECK: cxpt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xaf]
+#CHECK: cxpt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xaf]
+
+	cxpt	%f0, 0(1), 0
+	cxpt	%f13, 0(1), 0
+	cxpt	%f0, 0(1), 15
+	cxpt	%f0, 0(1,%r1), 0
+	cxpt	%f0, 0(1,%r15), 0
+	cxpt	%f0, 4095(1,%r1), 0
+	cxpt	%f0, 4095(1,%r15), 0
+	cxpt	%f0, 0(256,%r1), 0
+	cxpt	%f0, 0(256,%r15), 0
+
 #CHECK: lcbb    %r0, 0, 0               # encoding: [0xe7,0x00,0x00,0x00,0x00,0x27]
 #CHECK: lcbb    %r0, 0, 15              # encoding: [0xe7,0x00,0x00,0x00,0xf0,0x27]
 #CHECK: lcbb    %r0, 4095, 0            # encoding: [0xe7,0x00,0x0f,0xff,0x00,0x27]
diff --git a/test/MC/SystemZ/insn-good-z196.s b/test/MC/SystemZ/insn-good-z196.s
index 02c473c11a4a..31d257d7448d 100644
--- a/test/MC/SystemZ/insn-good-z196.s
+++ b/test/MC/SystemZ/insn-good-z196.s
@@ -2,6 +2,20 @@
 # RUN: llvm-mc -triple s390x-linux-gnu -mcpu=z196 -show-encoding %s | FileCheck %s
 # RUN: llvm-mc -triple s390x-linux-gnu -mcpu=arch9 -show-encoding %s | FileCheck %s
 
+#CHECK: adtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xd2,0x00,0x00]
+#CHECK: adtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xd2,0x0f,0x00]
+#CHECK: adtra	%f0, %f0, %f15, 0       # encoding: [0xb3,0xd2,0xf0,0x00]
+#CHECK: adtra	%f0, %f15, %f0, 0       # encoding: [0xb3,0xd2,0x00,0x0f]
+#CHECK: adtra	%f15, %f0, %f0, 0       # encoding: [0xb3,0xd2,0x00,0xf0]
+#CHECK: adtra	%f7, %f8, %f9, 10       # encoding: [0xb3,0xd2,0x9a,0x78]
+
+	adtra	%f0, %f0, %f0, 0
+	adtra	%f0, %f0, %f0, 15
+	adtra	%f0, %f0, %f15, 0
+	adtra	%f0, %f15, %f0, 0
+	adtra	%f15, %f0, %f0, 0
+	adtra	%f7, %f8, %f9, 10
+
 #CHECK: aghik	%r0, %r0, -32768        # encoding: [0xec,0x00,0x80,0x00,0x00,0xd9]
 #CHECK: aghik	%r0, %r0, -1            # encoding: [0xec,0x00,0xff,0xff,0x00,0xd9]
 #CHECK: aghik	%r0, %r0, 0             # encoding: [0xec,0x00,0x00,0x00,0x00,0xd9]
@@ -136,6 +150,20 @@
 	ark	%r15,%r0,%r0
 	ark	%r7,%r8,%r9
 
+#CHECK: axtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xda,0x00,0x00]
+#CHECK: axtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xda,0x0f,0x00]
+#CHECK: axtra	%f0, %f0, %f13, 0       # encoding: [0xb3,0xda,0xd0,0x00]
+#CHECK: axtra	%f0, %f13, %f0, 0       # encoding: [0xb3,0xda,0x00,0x0d]
+#CHECK: axtra	%f13, %f0, %f0, 0       # encoding: [0xb3,0xda,0x00,0xd0]
+#CHECK: axtra	%f8, %f8, %f8, 8        # encoding: [0xb3,0xda,0x88,0x88]
+
+	axtra	%f0, %f0, %f0, 0
+	axtra	%f0, %f0, %f0, 15
+	axtra	%f0, %f0, %f13, 0
+	axtra	%f0, %f13, %f0, 0
+	axtra	%f13, %f0, %f0, 0
+	axtra	%f8, %f8, %f8, 8
+
 #CHECK: brcth	%r0, .[[LAB:L.*]]-4294967296 # encoding: [0xcc,0x06,A,A,A,A]
 #CHECK:  fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
 	brcth	%r0, -0x100000000
@@ -187,6 +215,20 @@
 	cdfbra	%f4, 5, %r6, 7
 	cdfbra	%f15, 0, %r0, 0
 
+#CHECK: cdftr	%f0, 0, %r0, 0          # encoding: [0xb9,0x51,0x00,0x00]
+#CHECK: cdftr	%f0, 0, %r0, 15         # encoding: [0xb9,0x51,0x0f,0x00]
+#CHECK: cdftr	%f0, 0, %r15, 0         # encoding: [0xb9,0x51,0x00,0x0f]
+#CHECK: cdftr	%f0, 15, %r0, 0         # encoding: [0xb9,0x51,0xf0,0x00]
+#CHECK: cdftr	%f4, 5, %r6, 7          # encoding: [0xb9,0x51,0x57,0x46]
+#CHECK: cdftr	%f15, 0, %r0, 0         # encoding: [0xb9,0x51,0x00,0xf0]
+
+	cdftr	%f0, 0, %r0, 0
+	cdftr	%f0, 0, %r0, 15
+	cdftr	%f0, 0, %r15, 0
+	cdftr	%f0, 15, %r0, 0
+	cdftr	%f4, 5, %r6, 7
+	cdftr	%f15, 0, %r0, 0
+
 #CHECK: cdgbra	%f0, 0, %r0, 0          # encoding: [0xb3,0xa5,0x00,0x00]
 #CHECK: cdgbra	%f0, 0, %r0, 15         # encoding: [0xb3,0xa5,0x0f,0x00]
 #CHECK: cdgbra	%f0, 0, %r15, 0         # encoding: [0xb3,0xa5,0x00,0x0f]
@@ -201,6 +243,20 @@
 	cdgbra	%f4, 5, %r6, 7
 	cdgbra	%f15, 0, %r0, 0
 
+#CHECK: cdgtra	%f0, 0, %r0, 0          # encoding: [0xb3,0xf1,0x00,0x00]
+#CHECK: cdgtra	%f0, 0, %r0, 15         # encoding: [0xb3,0xf1,0x0f,0x00]
+#CHECK: cdgtra	%f0, 0, %r15, 0         # encoding: [0xb3,0xf1,0x00,0x0f]
+#CHECK: cdgtra	%f0, 15, %r0, 0         # encoding: [0xb3,0xf1,0xf0,0x00]
+#CHECK: cdgtra	%f4, 5, %r6, 7          # encoding: [0xb3,0xf1,0x57,0x46]
+#CHECK: cdgtra	%f15, 0, %r0, 0         # encoding: [0xb3,0xf1,0x00,0xf0]
+
+	cdgtra	%f0, 0, %r0, 0
+	cdgtra	%f0, 0, %r0, 15
+	cdgtra	%f0, 0, %r15, 0
+	cdgtra	%f0, 15, %r0, 0
+	cdgtra	%f4, 5, %r6, 7
+	cdgtra	%f15, 0, %r0, 0
+
 #CHECK: cdlfbr	%f0, 0, %r0, 0          # encoding: [0xb3,0x91,0x00,0x00]
 #CHECK: cdlfbr	%f0, 0, %r0, 15         # encoding: [0xb3,0x91,0x0f,0x00]
 #CHECK: cdlfbr	%f0, 0, %r15, 0         # encoding: [0xb3,0x91,0x00,0x0f]
@@ -215,6 +271,20 @@
 	cdlfbr	%f4, 5, %r6, 7
 	cdlfbr	%f15, 0, %r0, 0
 
+#CHECK: cdlftr	%f0, 0, %r0, 0          # encoding: [0xb9,0x53,0x00,0x00]
+#CHECK: cdlftr	%f0, 0, %r0, 15         # encoding: [0xb9,0x53,0x0f,0x00]
+#CHECK: cdlftr	%f0, 0, %r15, 0         # encoding: [0xb9,0x53,0x00,0x0f]
+#CHECK: cdlftr	%f0, 15, %r0, 0         # encoding: [0xb9,0x53,0xf0,0x00]
+#CHECK: cdlftr	%f4, 5, %r6, 7          # encoding: [0xb9,0x53,0x57,0x46]
+#CHECK: cdlftr	%f15, 0, %r0, 0         # encoding: [0xb9,0x53,0x00,0xf0]
+
+	cdlftr	%f0, 0, %r0, 0
+	cdlftr	%f0, 0, %r0, 15
+	cdlftr	%f0, 0, %r15, 0
+	cdlftr	%f0, 15, %r0, 0
+	cdlftr	%f4, 5, %r6, 7
+	cdlftr	%f15, 0, %r0, 0
+
 #CHECK: cdlgbr	%f0, 0, %r0, 0          # encoding: [0xb3,0xa1,0x00,0x00]
 #CHECK: cdlgbr	%f0, 0, %r0, 15         # encoding: [0xb3,0xa1,0x0f,0x00]
 #CHECK: cdlgbr	%f0, 0, %r15, 0         # encoding: [0xb3,0xa1,0x00,0x0f]
@@ -229,6 +299,20 @@
 	cdlgbr	%f4, 5, %r6, 7
 	cdlgbr	%f15, 0, %r0, 0
 
+#CHECK: cdlgtr	%f0, 0, %r0, 0          # encoding: [0xb9,0x52,0x00,0x00]
+#CHECK: cdlgtr	%f0, 0, %r0, 15         # encoding: [0xb9,0x52,0x0f,0x00]
+#CHECK: cdlgtr	%f0, 0, %r15, 0         # encoding: [0xb9,0x52,0x00,0x0f]
+#CHECK: cdlgtr	%f0, 15, %r0, 0         # encoding: [0xb9,0x52,0xf0,0x00]
+#CHECK: cdlgtr	%f4, 5, %r6, 7          # encoding: [0xb9,0x52,0x57,0x46]
+#CHECK: cdlgtr	%f15, 0, %r0, 0         # encoding: [0xb9,0x52,0x00,0xf0]
+
+	cdlgtr	%f0, 0, %r0, 0
+	cdlgtr	%f0, 0, %r0, 15
+	cdlgtr	%f0, 0, %r15, 0
+	cdlgtr	%f0, 15, %r0, 0
+	cdlgtr	%f4, 5, %r6, 7
+	cdlgtr	%f15, 0, %r0, 0
+
 #CHECK: cefbra	%f0, 0, %r0, 0          # encoding: [0xb3,0x94,0x00,0x00]
 #CHECK: cefbra	%f0, 0, %r0, 15         # encoding: [0xb3,0x94,0x0f,0x00]
 #CHECK: cefbra	%f0, 0, %r15, 0         # encoding: [0xb3,0x94,0x00,0x0f]
@@ -299,6 +383,20 @@
 	cfdbra	%r4, 5, %f6, 7
 	cfdbra	%r15, 0, %f0, 0
 
+#CHECK: cfdtr	%r0, 0, %f0, 0          # encoding: [0xb9,0x41,0x00,0x00]
+#CHECK: cfdtr	%r0, 0, %f0, 15         # encoding: [0xb9,0x41,0x0f,0x00]
+#CHECK: cfdtr	%r0, 0, %f15, 0         # encoding: [0xb9,0x41,0x00,0x0f]
+#CHECK: cfdtr	%r0, 15, %f0, 0         # encoding: [0xb9,0x41,0xf0,0x00]
+#CHECK: cfdtr	%r4, 5, %f6, 7          # encoding: [0xb9,0x41,0x57,0x46]
+#CHECK: cfdtr	%r15, 0, %f0, 0         # encoding: [0xb9,0x41,0x00,0xf0]
+
+	cfdtr	%r0, 0, %f0, 0
+	cfdtr	%r0, 0, %f0, 15
+	cfdtr	%r0, 0, %f15, 0
+	cfdtr	%r0, 15, %f0, 0
+	cfdtr	%r4, 5, %f6, 7
+	cfdtr	%r15, 0, %f0, 0
+
 #CHECK: cfebra	%r0, 0, %f0, 0          # encoding: [0xb3,0x98,0x00,0x00]
 #CHECK: cfebra	%r0, 0, %f0, 15         # encoding: [0xb3,0x98,0x0f,0x00]
 #CHECK: cfebra	%r0, 0, %f15, 0         # encoding: [0xb3,0x98,0x00,0x0f]
@@ -327,6 +425,20 @@
 	cfxbra	%r7, 5, %f8, 9
 	cfxbra	%r15, 0, %f0, 0
 
+#CHECK: cfxtr	%r0, 0, %f0, 0          # encoding: [0xb9,0x49,0x00,0x00]
+#CHECK: cfxtr	%r0, 0, %f0, 15         # encoding: [0xb9,0x49,0x0f,0x00]
+#CHECK: cfxtr	%r0, 0, %f13, 0         # encoding: [0xb9,0x49,0x00,0x0d]
+#CHECK: cfxtr	%r0, 15, %f0, 0         # encoding: [0xb9,0x49,0xf0,0x00]
+#CHECK: cfxtr	%r7, 5, %f8, 9          # encoding: [0xb9,0x49,0x59,0x78]
+#CHECK: cfxtr	%r15, 0, %f0, 0         # encoding: [0xb9,0x49,0x00,0xf0]
+
+	cfxtr	%r0, 0, %f0, 0
+	cfxtr	%r0, 0, %f0, 15
+	cfxtr	%r0, 0, %f13, 0
+	cfxtr	%r0, 15, %f0, 0
+	cfxtr	%r7, 5, %f8, 9
+	cfxtr	%r15, 0, %f0, 0
+
 #CHECK: cgdbra	%r0, 0, %f0, 0          # encoding: [0xb3,0xa9,0x00,0x00]
 #CHECK: cgdbra	%r0, 0, %f0, 15         # encoding: [0xb3,0xa9,0x0f,0x00]
 #CHECK: cgdbra	%r0, 0, %f15, 0         # encoding: [0xb3,0xa9,0x00,0x0f]
@@ -341,6 +453,20 @@
 	cgdbra	%r4, 5, %f6, 7
 	cgdbra	%r15, 0, %f0, 0
 
+#CHECK: cgdtra	%r0, 0, %f0, 0          # encoding: [0xb3,0xe1,0x00,0x00]
+#CHECK: cgdtra	%r0, 0, %f0, 15         # encoding: [0xb3,0xe1,0x0f,0x00]
+#CHECK: cgdtra	%r0, 0, %f15, 0         # encoding: [0xb3,0xe1,0x00,0x0f]
+#CHECK: cgdtra	%r0, 15, %f0, 0         # encoding: [0xb3,0xe1,0xf0,0x00]
+#CHECK: cgdtra	%r4, 5, %f6, 7          # encoding: [0xb3,0xe1,0x57,0x46]
+#CHECK: cgdtra	%r15, 0, %f0, 0         # encoding: [0xb3,0xe1,0x00,0xf0]
+
+	cgdtra	%r0, 0, %f0, 0
+	cgdtra	%r0, 0, %f0, 15
+	cgdtra	%r0, 0, %f15, 0
+	cgdtra	%r0, 15, %f0, 0
+	cgdtra	%r4, 5, %f6, 7
+	cgdtra	%r15, 0, %f0, 0
+
 #CHECK: cgebra	%r0, 0, %f0, 0          # encoding: [0xb3,0xa8,0x00,0x00]
 #CHECK: cgebra	%r0, 0, %f0, 15         # encoding: [0xb3,0xa8,0x0f,0x00]
 #CHECK: cgebra	%r0, 0, %f15, 0         # encoding: [0xb3,0xa8,0x00,0x0f]
@@ -369,6 +495,20 @@
 	cgxbra	%r7, 5, %f8, 9
 	cgxbra	%r15, 0, %f0, 0
 
+#CHECK: cgxtra	%r0, 0, %f0, 0          # encoding: [0xb3,0xe9,0x00,0x00]
+#CHECK: cgxtra	%r0, 0, %f0, 15         # encoding: [0xb3,0xe9,0x0f,0x00]
+#CHECK: cgxtra	%r0, 0, %f13, 0         # encoding: [0xb3,0xe9,0x00,0x0d]
+#CHECK: cgxtra	%r0, 15, %f0, 0         # encoding: [0xb3,0xe9,0xf0,0x00]
+#CHECK: cgxtra	%r7, 5, %f8, 9          # encoding: [0xb3,0xe9,0x59,0x78]
+#CHECK: cgxtra	%r15, 0, %f0, 0         # encoding: [0xb3,0xe9,0x00,0xf0]
+
+	cgxtra	%r0, 0, %f0, 0
+	cgxtra	%r0, 0, %f0, 15
+	cgxtra	%r0, 0, %f13, 0
+	cgxtra	%r0, 15, %f0, 0
+	cgxtra	%r7, 5, %f8, 9
+	cgxtra	%r15, 0, %f0, 0
+
 #CHECK: chf	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xcd]
 #CHECK: chf	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xcd]
 #CHECK: chf	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xcd]
@@ -419,6 +559,20 @@
 	clfdbr	%r4, 5, %f6, 7
 	clfdbr	%r15, 0, %f0, 0
 
+#CHECK: clfdtr	%r0, 0, %f0, 0          # encoding: [0xb9,0x43,0x00,0x00]
+#CHECK: clfdtr	%r0, 0, %f0, 15         # encoding: [0xb9,0x43,0x0f,0x00]
+#CHECK: clfdtr	%r0, 0, %f15, 0         # encoding: [0xb9,0x43,0x00,0x0f]
+#CHECK: clfdtr	%r0, 15, %f0, 0         # encoding: [0xb9,0x43,0xf0,0x00]
+#CHECK: clfdtr	%r4, 5, %f6, 7          # encoding: [0xb9,0x43,0x57,0x46]
+#CHECK: clfdtr	%r15, 0, %f0, 0         # encoding: [0xb9,0x43,0x00,0xf0]
+
+	clfdtr	%r0, 0, %f0, 0
+	clfdtr	%r0, 0, %f0, 15
+	clfdtr	%r0, 0, %f15, 0
+	clfdtr	%r0, 15, %f0, 0
+	clfdtr	%r4, 5, %f6, 7
+	clfdtr	%r15, 0, %f0, 0
+
 #CHECK: clfebr	%r0, 0, %f0, 0          # encoding: [0xb3,0x9c,0x00,0x00]
 #CHECK: clfebr	%r0, 0, %f0, 15         # encoding: [0xb3,0x9c,0x0f,0x00]
 #CHECK: clfebr	%r0, 0, %f15, 0         # encoding: [0xb3,0x9c,0x00,0x0f]
@@ -447,6 +601,20 @@
 	clfxbr	%r7, 5, %f8, 9
 	clfxbr	%r15, 0, %f0, 0
 
+#CHECK: clfxtr	%r0, 0, %f0, 0          # encoding: [0xb9,0x4b,0x00,0x00]
+#CHECK: clfxtr	%r0, 0, %f0, 15         # encoding: [0xb9,0x4b,0x0f,0x00]
+#CHECK: clfxtr	%r0, 0, %f13, 0         # encoding: [0xb9,0x4b,0x00,0x0d]
+#CHECK: clfxtr	%r0, 15, %f0, 0         # encoding: [0xb9,0x4b,0xf0,0x00]
+#CHECK: clfxtr	%r7, 5, %f8, 9          # encoding: [0xb9,0x4b,0x59,0x78]
+#CHECK: clfxtr	%r15, 0, %f0, 0         # encoding: [0xb9,0x4b,0x00,0xf0]
+
+	clfxtr	%r0, 0, %f0, 0
+	clfxtr	%r0, 0, %f0, 15
+	clfxtr	%r0, 0, %f13, 0
+	clfxtr	%r0, 15, %f0, 0
+	clfxtr	%r7, 5, %f8, 9
+	clfxtr	%r15, 0, %f0, 0
+
 #CHECK: clgdbr	%r0, 0, %f0, 0          # encoding: [0xb3,0xad,0x00,0x00]
 #CHECK: clgdbr	%r0, 0, %f0, 15         # encoding: [0xb3,0xad,0x0f,0x00]
 #CHECK: clgdbr	%r0, 0, %f15, 0         # encoding: [0xb3,0xad,0x00,0x0f]
@@ -461,6 +629,20 @@
 	clgdbr	%r4, 5, %f6, 7
 	clgdbr	%r15, 0, %f0, 0
 
+#CHECK: clgdtr	%r0, 0, %f0, 0          # encoding: [0xb9,0x42,0x00,0x00]
+#CHECK: clgdtr	%r0, 0, %f0, 15         # encoding: [0xb9,0x42,0x0f,0x00]
+#CHECK: clgdtr	%r0, 0, %f15, 0         # encoding: [0xb9,0x42,0x00,0x0f]
+#CHECK: clgdtr	%r0, 15, %f0, 0         # encoding: [0xb9,0x42,0xf0,0x00]
+#CHECK: clgdtr	%r4, 5, %f6, 7          # encoding: [0xb9,0x42,0x57,0x46]
+#CHECK: clgdtr	%r15, 0, %f0, 0         # encoding: [0xb9,0x42,0x00,0xf0]
+
+	clgdtr	%r0, 0, %f0, 0
+	clgdtr	%r0, 0, %f0, 15
+	clgdtr	%r0, 0, %f15, 0
+	clgdtr	%r0, 15, %f0, 0
+	clgdtr	%r4, 5, %f6, 7
+	clgdtr	%r15, 0, %f0, 0
+
 #CHECK: clgebr	%r0, 0, %f0, 0          # encoding: [0xb3,0xac,0x00,0x00]
 #CHECK: clgebr	%r0, 0, %f0, 15         # encoding: [0xb3,0xac,0x0f,0x00]
 #CHECK: clgebr	%r0, 0, %f15, 0         # encoding: [0xb3,0xac,0x00,0x0f]
@@ -489,6 +671,20 @@
 	clgxbr	%r7, 5, %f8, 9
 	clgxbr	%r15, 0, %f0, 0
 
+#CHECK: clgxtr	%r0, 0, %f0, 0          # encoding: [0xb9,0x4a,0x00,0x00]
+#CHECK: clgxtr	%r0, 0, %f0, 15         # encoding: [0xb9,0x4a,0x0f,0x00]
+#CHECK: clgxtr	%r0, 0, %f13, 0         # encoding: [0xb9,0x4a,0x00,0x0d]
+#CHECK: clgxtr	%r0, 15, %f0, 0         # encoding: [0xb9,0x4a,0xf0,0x00]
+#CHECK: clgxtr	%r7, 5, %f8, 9          # encoding: [0xb9,0x4a,0x59,0x78]
+#CHECK: clgxtr	%r15, 0, %f0, 0         # encoding: [0xb9,0x4a,0x00,0xf0]
+
+	clgxtr	%r0, 0, %f0, 0
+	clgxtr	%r0, 0, %f0, 15
+	clgxtr	%r0, 0, %f13, 0
+	clgxtr	%r0, 15, %f0, 0
+	clgxtr	%r7, 5, %f8, 9
+	clgxtr	%r15, 0, %f0, 0
+
 #CHECK: clhf	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0xcf]
 #CHECK: clhf	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0xcf]
 #CHECK: clhf	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0xcf]
@@ -535,6 +731,20 @@
 	cxfbra	%f4, 5, %r9, 10
 	cxfbra	%f13, 0, %r0, 0
 
+#CHECK: cxftr	%f0, 0, %r0, 0          # encoding: [0xb9,0x59,0x00,0x00]
+#CHECK: cxftr	%f0, 0, %r0, 15         # encoding: [0xb9,0x59,0x0f,0x00]
+#CHECK: cxftr	%f0, 0, %r15, 0         # encoding: [0xb9,0x59,0x00,0x0f]
+#CHECK: cxftr	%f0, 15, %r0, 0         # encoding: [0xb9,0x59,0xf0,0x00]
+#CHECK: cxftr	%f4, 5, %r9, 10         # encoding: [0xb9,0x59,0x5a,0x49]
+#CHECK: cxftr	%f13, 0, %r0, 0         # encoding: [0xb9,0x59,0x00,0xd0]
+
+	cxftr	%f0, 0, %r0, 0
+	cxftr	%f0, 0, %r0, 15
+	cxftr	%f0, 0, %r15, 0
+	cxftr	%f0, 15, %r0, 0
+	cxftr	%f4, 5, %r9, 10
+	cxftr	%f13, 0, %r0, 0
+
 #CHECK: cxgbra	%f0, 0, %r0, 0          # encoding: [0xb3,0xa6,0x00,0x00]
 #CHECK: cxgbra	%f0, 0, %r0, 15         # encoding: [0xb3,0xa6,0x0f,0x00]
 #CHECK: cxgbra	%f0, 0, %r15, 0         # encoding: [0xb3,0xa6,0x00,0x0f]
@@ -549,6 +759,20 @@
 	cxgbra	%f4, 5, %r9, 10
 	cxgbra	%f13, 0, %r0, 0
 
+#CHECK: cxgtra	%f0, 0, %r0, 0          # encoding: [0xb3,0xf9,0x00,0x00]
+#CHECK: cxgtra	%f0, 0, %r0, 15         # encoding: [0xb3,0xf9,0x0f,0x00]
+#CHECK: cxgtra	%f0, 0, %r15, 0         # encoding: [0xb3,0xf9,0x00,0x0f]
+#CHECK: cxgtra	%f0, 15, %r0, 0         # encoding: [0xb3,0xf9,0xf0,0x00]
+#CHECK: cxgtra	%f4, 5, %r9, 10         # encoding: [0xb3,0xf9,0x5a,0x49]
+#CHECK: cxgtra	%f13, 0, %r0, 0         # encoding: [0xb3,0xf9,0x00,0xd0]
+
+	cxgtra	%f0, 0, %r0, 0
+	cxgtra	%f0, 0, %r0, 15
+	cxgtra	%f0, 0, %r15, 0
+	cxgtra	%f0, 15, %r0, 0
+	cxgtra	%f4, 5, %r9, 10
+	cxgtra	%f13, 0, %r0, 0
+
 #CHECK: cxlfbr	%f0, 0, %r0, 0          # encoding: [0xb3,0x92,0x00,0x00]
 #CHECK: cxlfbr	%f0, 0, %r0, 15         # encoding: [0xb3,0x92,0x0f,0x00]
 #CHECK: cxlfbr	%f0, 0, %r15, 0         # encoding: [0xb3,0x92,0x00,0x0f]
@@ -563,6 +787,20 @@
 	cxlfbr	%f4, 5, %r9, 10
 	cxlfbr	%f13, 0, %r0, 0
 
+#CHECK: cxlftr	%f0, 0, %r0, 0          # encoding: [0xb9,0x5b,0x00,0x00]
+#CHECK: cxlftr	%f0, 0, %r0, 15         # encoding: [0xb9,0x5b,0x0f,0x00]
+#CHECK: cxlftr	%f0, 0, %r15, 0         # encoding: [0xb9,0x5b,0x00,0x0f]
+#CHECK: cxlftr	%f0, 15, %r0, 0         # encoding: [0xb9,0x5b,0xf0,0x00]
+#CHECK: cxlftr	%f4, 5, %r9, 10         # encoding: [0xb9,0x5b,0x5a,0x49]
+#CHECK: cxlftr	%f13, 0, %r0, 0         # encoding: [0xb9,0x5b,0x00,0xd0]
+
+	cxlftr	%f0, 0, %r0, 0
+	cxlftr	%f0, 0, %r0, 15
+	cxlftr	%f0, 0, %r15, 0
+	cxlftr	%f0, 15, %r0, 0
+	cxlftr	%f4, 5, %r9, 10
+	cxlftr	%f13, 0, %r0, 0
+
 #CHECK: cxlgbr	%f0, 0, %r0, 0          # encoding: [0xb3,0xa2,0x00,0x00]
 #CHECK: cxlgbr	%f0, 0, %r0, 15         # encoding: [0xb3,0xa2,0x0f,0x00]
 #CHECK: cxlgbr	%f0, 0, %r15, 0         # encoding: [0xb3,0xa2,0x00,0x0f]
@@ -577,6 +815,48 @@
 	cxlgbr	%f4, 5, %r9, 10
 	cxlgbr	%f13, 0, %r0, 0
 
+#CHECK: cxlgtr	%f0, 0, %r0, 0          # encoding: [0xb9,0x5a,0x00,0x00]
+#CHECK: cxlgtr	%f0, 0, %r0, 15         # encoding: [0xb9,0x5a,0x0f,0x00]
+#CHECK: cxlgtr	%f0, 0, %r15, 0         # encoding: [0xb9,0x5a,0x00,0x0f]
+#CHECK: cxlgtr	%f0, 15, %r0, 0         # encoding: [0xb9,0x5a,0xf0,0x00]
+#CHECK: cxlgtr	%f4, 5, %r9, 10         # encoding: [0xb9,0x5a,0x5a,0x49]
+#CHECK: cxlgtr	%f13, 0, %r0, 0         # encoding: [0xb9,0x5a,0x00,0xd0]
+
+	cxlgtr	%f0, 0, %r0, 0
+	cxlgtr	%f0, 0, %r0, 15
+	cxlgtr	%f0, 0, %r15, 0
+	cxlgtr	%f0, 15, %r0, 0
+	cxlgtr	%f4, 5, %r9, 10
+	cxlgtr	%f13, 0, %r0, 0
+
+#CHECK: ddtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xd1,0x00,0x00]
+#CHECK: ddtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xd1,0x0f,0x00]
+#CHECK: ddtra	%f0, %f0, %f15, 0       # encoding: [0xb3,0xd1,0xf0,0x00]
+#CHECK: ddtra	%f0, %f15, %f0, 0       # encoding: [0xb3,0xd1,0x00,0x0f]
+#CHECK: ddtra	%f15, %f0, %f0, 0       # encoding: [0xb3,0xd1,0x00,0xf0]
+#CHECK: ddtra	%f7, %f8, %f9, 10       # encoding: [0xb3,0xd1,0x9a,0x78]
+
+	ddtra	%f0, %f0, %f0, 0
+	ddtra	%f0, %f0, %f0, 15
+	ddtra	%f0, %f0, %f15, 0
+	ddtra	%f0, %f15, %f0, 0
+	ddtra	%f15, %f0, %f0, 0
+	ddtra	%f7, %f8, %f9, 10
+
+#CHECK: dxtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xd9,0x00,0x00]
+#CHECK: dxtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xd9,0x0f,0x00]
+#CHECK: dxtra	%f0, %f0, %f13, 0       # encoding: [0xb3,0xd9,0xd0,0x00]
+#CHECK: dxtra	%f0, %f13, %f0, 0       # encoding: [0xb3,0xd9,0x00,0x0d]
+#CHECK: dxtra	%f13, %f0, %f0, 0       # encoding: [0xb3,0xd9,0x00,0xd0]
+#CHECK: dxtra	%f8, %f8, %f8, 8        # encoding: [0xb3,0xd9,0x88,0x88]
+
+	dxtra	%f0, %f0, %f0, 0
+	dxtra	%f0, %f0, %f0, 15
+	dxtra	%f0, %f0, %f13, 0
+	dxtra	%f0, %f13, %f0, 0
+	dxtra	%f13, %f0, %f0, 0
+	dxtra	%f8, %f8, %f8, 8
+
 #CHECK: fidbra	%f0, 0, %f0, 0          # encoding: [0xb3,0x5f,0x00,0x00]
 #CHECK: fidbra	%f0, 0, %f0, 15         # encoding: [0xb3,0x5f,0x0f,0x00]
 #CHECK: fidbra	%f0, 0, %f15, 0         # encoding: [0xb3,0x5f,0x00,0x0f]
@@ -1285,6 +1565,34 @@
 	lpdg	%r2, 0(%r1), 1(%r15)
 	lpdg	%r2, 0(%r1), 4095(%r15)
 
+#CHECK: mdtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xd0,0x00,0x00]
+#CHECK: mdtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xd0,0x0f,0x00]
+#CHECK: mdtra	%f0, %f0, %f15, 0       # encoding: [0xb3,0xd0,0xf0,0x00]
+#CHECK: mdtra	%f0, %f15, %f0, 0       # encoding: [0xb3,0xd0,0x00,0x0f]
+#CHECK: mdtra	%f15, %f0, %f0, 0       # encoding: [0xb3,0xd0,0x00,0xf0]
+#CHECK: mdtra	%f7, %f8, %f9, 10       # encoding: [0xb3,0xd0,0x9a,0x78]
+
+	mdtra	%f0, %f0, %f0, 0
+	mdtra	%f0, %f0, %f0, 15
+	mdtra	%f0, %f0, %f15, 0
+	mdtra	%f0, %f15, %f0, 0
+	mdtra	%f15, %f0, %f0, 0
+	mdtra	%f7, %f8, %f9, 10
+
+#CHECK: mxtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xd8,0x00,0x00]
+#CHECK: mxtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xd8,0x0f,0x00]
+#CHECK: mxtra	%f0, %f0, %f13, 0       # encoding: [0xb3,0xd8,0xd0,0x00]
+#CHECK: mxtra	%f0, %f13, %f0, 0       # encoding: [0xb3,0xd8,0x00,0x0d]
+#CHECK: mxtra	%f13, %f0, %f0, 0       # encoding: [0xb3,0xd8,0x00,0xd0]
+#CHECK: mxtra	%f8, %f8, %f8, 8        # encoding: [0xb3,0xd8,0x88,0x88]
+
+	mxtra	%f0, %f0, %f0, 0
+	mxtra	%f0, %f0, %f0, 15
+	mxtra	%f0, %f0, %f13, 0
+	mxtra	%f0, %f13, %f0, 0
+	mxtra	%f13, %f0, %f0, 0
+	mxtra	%f8, %f8, %f8, 8
+
 #CHECK: ngrk	%r0, %r0, %r0           # encoding: [0xb9,0xe4,0x00,0x00]
 #CHECK: ngrk	%r0, %r0, %r15          # encoding: [0xb9,0xe4,0xf0,0x00]
 #CHECK: ngrk	%r0, %r15, %r0          # encoding: [0xb9,0xe4,0x00,0x0f]
@@ -1379,6 +1687,20 @@
 	risblg	%r15,%r0,0,0,0
 	risblg	%r4,%r5,6,7,8
 
+#CHECK: sdtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xd3,0x00,0x00]
+#CHECK: sdtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xd3,0x0f,0x00]
+#CHECK: sdtra	%f0, %f0, %f15, 0       # encoding: [0xb3,0xd3,0xf0,0x00]
+#CHECK: sdtra	%f0, %f15, %f0, 0       # encoding: [0xb3,0xd3,0x00,0x0f]
+#CHECK: sdtra	%f15, %f0, %f0, 0       # encoding: [0xb3,0xd3,0x00,0xf0]
+#CHECK: sdtra	%f7, %f8, %f9, 10       # encoding: [0xb3,0xd3,0x9a,0x78]
+
+	sdtra	%f0, %f0, %f0, 0
+	sdtra	%f0, %f0, %f0, 15
+	sdtra	%f0, %f0, %f15, 0
+	sdtra	%f0, %f15, %f0, 0
+	sdtra	%f15, %f0, %f0, 0
+	sdtra	%f7, %f8, %f9, 10
+
 #CHECK: sgrk	%r0, %r0, %r0           # encoding: [0xb9,0xe9,0x00,0x00]
 #CHECK: sgrk	%r0, %r0, %r15          # encoding: [0xb9,0xe9,0xf0,0x00]
 #CHECK: sgrk	%r0, %r15, %r0          # encoding: [0xb9,0xe9,0x00,0x0f]
@@ -1731,6 +2053,20 @@
 	stocgnp  %r1,2(%r3)
 	stocgno  %r1,2(%r3)
 
+#CHECK: sxtra	%f0, %f0, %f0, 0        # encoding: [0xb3,0xdb,0x00,0x00]
+#CHECK: sxtra	%f0, %f0, %f0, 15       # encoding: [0xb3,0xdb,0x0f,0x00]
+#CHECK: sxtra	%f0, %f0, %f13, 0       # encoding: [0xb3,0xdb,0xd0,0x00]
+#CHECK: sxtra	%f0, %f13, %f0, 0       # encoding: [0xb3,0xdb,0x00,0x0d]
+#CHECK: sxtra	%f13, %f0, %f0, 0       # encoding: [0xb3,0xdb,0x00,0xd0]
+#CHECK: sxtra	%f8, %f8, %f8, 8        # encoding: [0xb3,0xdb,0x88,0x88]
+
+	sxtra	%f0, %f0, %f0, 0
+	sxtra	%f0, %f0, %f0, 15
+	sxtra	%f0, %f0, %f13, 0
+	sxtra	%f0, %f13, %f0, 0
+	sxtra	%f13, %f0, %f0, 0
+	sxtra	%f8, %f8, %f8, 8
+
 #CHECK: xgrk	%r0, %r0, %r0           # encoding: [0xb9,0xe7,0x00,0x00]
 #CHECK: xgrk	%r0, %r0, %r15          # encoding: [0xb9,0xe7,0xf0,0x00]
 #CHECK: xgrk	%r0, %r15, %r0          # encoding: [0xb9,0xe7,0x00,0x0f]
diff --git a/test/MC/SystemZ/insn-good-zEC12.s b/test/MC/SystemZ/insn-good-zEC12.s
index 275d68d8a619..2fe6c46ad908 100644
--- a/test/MC/SystemZ/insn-good-zEC12.s
+++ b/test/MC/SystemZ/insn-good-zEC12.s
@@ -114,6 +114,26 @@
 	bprp	8, branch, target@plt
 	bprp	9, branch@plt, target@plt
 
+#CHECK: cdzt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xaa]
+#CHECK: cdzt	%f15, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xf0,0xaa]
+#CHECK: cdzt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xaa]
+#CHECK: cdzt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xaa]
+#CHECK: cdzt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xaa]
+#CHECK: cdzt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xaa]
+#CHECK: cdzt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xaa]
+#CHECK: cdzt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xaa]
+#CHECK: cdzt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xaa]
+
+	cdzt	%f0, 0(1), 0
+	cdzt	%f15, 0(1), 0
+	cdzt	%f0, 0(1), 15
+	cdzt	%f0, 0(1,%r1), 0
+	cdzt	%f0, 0(1,%r15), 0
+	cdzt	%f0, 4095(1,%r1), 0
+	cdzt	%f0, 4095(1,%r15), 0
+	cdzt	%f0, 0(256,%r1), 0
+	cdzt	%f0, 0(256,%r15), 0
+
 #CHECK: clt	%r0, 12, -524288            # encoding: [0xeb,0x0c,0x00,0x00,0x80,0x23]
 #CHECK: clt	%r0, 12, -1                 # encoding: [0xeb,0x0c,0x0f,0xff,0xff,0x23]
 #CHECK: clt	%r0, 12, 0                  # encoding: [0xeb,0x0c,0x00,0x00,0x00,0x23]
@@ -178,6 +198,66 @@
 	clgtnl	%r0, 0(%r15)
 	clgtnh	%r0, 0(%r15)
 
+#CHECK: cxzt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xab]
+#CHECK: cxzt	%f13, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xd0,0xab]
+#CHECK: cxzt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xab]
+#CHECK: cxzt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xab]
+#CHECK: cxzt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xab]
+#CHECK: cxzt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xab]
+#CHECK: cxzt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xab]
+#CHECK: cxzt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xab]
+#CHECK: cxzt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xab]
+
+	cxzt	%f0, 0(1), 0
+	cxzt	%f13, 0(1), 0
+	cxzt	%f0, 0(1), 15
+	cxzt	%f0, 0(1,%r1), 0
+	cxzt	%f0, 0(1,%r15), 0
+	cxzt	%f0, 4095(1,%r1), 0
+	cxzt	%f0, 4095(1,%r15), 0
+	cxzt	%f0, 0(256,%r1), 0
+	cxzt	%f0, 0(256,%r15), 0
+
+#CHECK: czdt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xa8]
+#CHECK: czdt	%f15, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xf0,0xa8]
+#CHECK: czdt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xa8]
+#CHECK: czdt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xa8]
+#CHECK: czdt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xa8]
+#CHECK: czdt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xa8]
+#CHECK: czdt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xa8]
+#CHECK: czdt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xa8]
+#CHECK: czdt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xa8]
+
+	czdt	%f0, 0(1), 0
+	czdt	%f15, 0(1), 0
+	czdt	%f0, 0(1), 15
+	czdt	%f0, 0(1,%r1), 0
+	czdt	%f0, 0(1,%r15), 0
+	czdt	%f0, 4095(1,%r1), 0
+	czdt	%f0, 4095(1,%r15), 0
+	czdt	%f0, 0(256,%r1), 0
+	czdt	%f0, 0(256,%r15), 0
+
+#CHECK: czxt	%f0, 0(1), 0                # encoding: [0xed,0x00,0x00,0x00,0x00,0xa9]
+#CHECK: czxt	%f13, 0(1), 0               # encoding: [0xed,0x00,0x00,0x00,0xd0,0xa9]
+#CHECK: czxt	%f0, 0(1), 15               # encoding: [0xed,0x00,0x00,0x00,0x0f,0xa9]
+#CHECK: czxt	%f0, 0(1,%r1), 0            # encoding: [0xed,0x00,0x10,0x00,0x00,0xa9]
+#CHECK: czxt	%f0, 0(1,%r15), 0           # encoding: [0xed,0x00,0xf0,0x00,0x00,0xa9]
+#CHECK: czxt	%f0, 4095(1,%r1), 0         # encoding: [0xed,0x00,0x1f,0xff,0x00,0xa9]
+#CHECK: czxt	%f0, 4095(1,%r15), 0        # encoding: [0xed,0x00,0xff,0xff,0x00,0xa9]
+#CHECK: czxt	%f0, 0(256,%r1), 0          # encoding: [0xed,0xff,0x10,0x00,0x00,0xa9]
+#CHECK: czxt	%f0, 0(256,%r15), 0         # encoding: [0xed,0xff,0xf0,0x00,0x00,0xa9]
+
+	czxt	%f0, 0(1), 0
+	czxt	%f13, 0(1), 0
+	czxt	%f0, 0(1), 15
+	czxt	%f0, 0(1,%r1), 0
+	czxt	%f0, 0(1,%r15), 0
+	czxt	%f0, 4095(1,%r1), 0
+	czxt	%f0, 4095(1,%r15), 0
+	czxt	%f0, 0(256,%r1), 0
+	czxt	%f0, 0(256,%r15), 0
+
 #CHECK: etnd	%r0                     # encoding: [0xb2,0xec,0x00,0x00]
 #CHECK: etnd	%r15                    # encoding: [0xb2,0xec,0x00,0xf0]
 #CHECK: etnd	%r7                     # encoding: [0xb2,0xec,0x00,0x70]
diff --git a/test/MC/SystemZ/insn-good.s b/test/MC/SystemZ/insn-good.s
index a6228f23c8f8..73162e4eea71 100644
--- a/test/MC/SystemZ/insn-good.s
+++ b/test/MC/SystemZ/insn-good.s
@@ -17,6 +17,22 @@
 	a	%r0, 4095(%r15,%r1)
 	a	%r15, 0
 
+#CHECK: ad	%f0, 0                  # encoding: [0x6a,0x00,0x00,0x00]
+#CHECK: ad	%f0, 4095               # encoding: [0x6a,0x00,0x0f,0xff]
+#CHECK: ad	%f0, 0(%r1)             # encoding: [0x6a,0x00,0x10,0x00]
+#CHECK: ad	%f0, 0(%r15)            # encoding: [0x6a,0x00,0xf0,0x00]
+#CHECK: ad	%f0, 4095(%r1,%r15)     # encoding: [0x6a,0x01,0xff,0xff]
+#CHECK: ad	%f0, 4095(%r15,%r1)     # encoding: [0x6a,0x0f,0x1f,0xff]
+#CHECK: ad	%f15, 0                 # encoding: [0x6a,0xf0,0x00,0x00]
+
+	ad	%f0, 0
+	ad	%f0, 4095
+	ad	%f0, 0(%r1)
+	ad	%f0, 0(%r15)
+	ad	%f0, 4095(%r1,%r15)
+	ad	%f0, 4095(%r15,%r1)
+	ad	%f15, 0
+
 #CHECK: adb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x1a]
 #CHECK: adb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1a]
 #CHECK: adb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x1a]
@@ -43,6 +59,44 @@
 	adbr	%f7, %f8
 	adbr	%f15, %f0
 
+#CHECK: adr	%f0, %f0                # encoding: [0x2a,0x00]
+#CHECK: adr	%f0, %f15               # encoding: [0x2a,0x0f]
+#CHECK: adr	%f7, %f8                # encoding: [0x2a,0x78]
+#CHECK: adr	%f15, %f0               # encoding: [0x2a,0xf0]
+
+	adr	%f0, %f0
+	adr	%f0, %f15
+	adr	%f7, %f8
+	adr	%f15, %f0
+
+#CHECK: adtr	%f0, %f0, %f0           # encoding: [0xb3,0xd2,0x00,0x00]
+#CHECK: adtr	%f0, %f0, %f15          # encoding: [0xb3,0xd2,0xf0,0x00]
+#CHECK: adtr	%f0, %f15, %f0          # encoding: [0xb3,0xd2,0x00,0x0f]
+#CHECK: adtr	%f15, %f0, %f0          # encoding: [0xb3,0xd2,0x00,0xf0]
+#CHECK: adtr	%f7, %f8, %f9           # encoding: [0xb3,0xd2,0x90,0x78]
+
+	adtr	%f0, %f0, %f0
+	adtr	%f0, %f0, %f15
+	adtr	%f0, %f15, %f0
+	adtr	%f15, %f0, %f0
+	adtr	%f7, %f8, %f9
+
+#CHECK: ae	%f0, 0                  # encoding: [0x7a,0x00,0x00,0x00]
+#CHECK: ae	%f0, 4095               # encoding: [0x7a,0x00,0x0f,0xff]
+#CHECK: ae	%f0, 0(%r1)             # encoding: [0x7a,0x00,0x10,0x00]
+#CHECK: ae	%f0, 0(%r15)            # encoding: [0x7a,0x00,0xf0,0x00]
+#CHECK: ae	%f0, 4095(%r1,%r15)     # encoding: [0x7a,0x01,0xff,0xff]
+#CHECK: ae	%f0, 4095(%r15,%r1)     # encoding: [0x7a,0x0f,0x1f,0xff]
+#CHECK: ae	%f15, 0                 # encoding: [0x7a,0xf0,0x00,0x00]
+
+	ae	%f0, 0
+	ae	%f0, 4095
+	ae	%f0, 0(%r1)
+	ae	%f0, 0(%r15)
+	ae	%f0, 4095(%r1,%r15)
+	ae	%f0, 4095(%r15,%r1)
+	ae	%f15, 0
+
 #CHECK: aeb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x0a]
 #CHECK: aeb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x0a]
 #CHECK: aeb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x0a]
@@ -69,6 +123,16 @@
 	aebr	%f7, %f8
 	aebr	%f15, %f0
 
+#CHECK: aer	%f0, %f0                # encoding: [0x3a,0x00]
+#CHECK: aer	%f0, %f15               # encoding: [0x3a,0x0f]
+#CHECK: aer	%f7, %f8                # encoding: [0x3a,0x78]
+#CHECK: aer	%f15, %f0               # encoding: [0x3a,0xf0]
+
+	aer	%f0, %f0
+	aer	%f0, %f15
+	aer	%f7, %f8
+	aer	%f15, %f0
+
 #CHECK: afi	%r0, -2147483648        # encoding: [0xc2,0x09,0x80,0x00,0x00,0x00]
 #CHECK: afi	%r0, -1                 # encoding: [0xc2,0x09,0xff,0xff,0xff,0xff]
 #CHECK: afi	%r0, 0                  # encoding: [0xc2,0x09,0x00,0x00,0x00,0x00]
@@ -571,6 +635,58 @@
 	asi	524287(%r1), 42
 	asi	524287(%r15), 42
 
+#CHECK: au	%f0, 0                  # encoding: [0x7e,0x00,0x00,0x00]
+#CHECK: au	%f0, 4095               # encoding: [0x7e,0x00,0x0f,0xff]
+#CHECK: au	%f0, 0(%r1)             # encoding: [0x7e,0x00,0x10,0x00]
+#CHECK: au	%f0, 0(%r15)            # encoding: [0x7e,0x00,0xf0,0x00]
+#CHECK: au	%f0, 4095(%r1,%r15)     # encoding: [0x7e,0x01,0xff,0xff]
+#CHECK: au	%f0, 4095(%r15,%r1)     # encoding: [0x7e,0x0f,0x1f,0xff]
+#CHECK: au	%f15, 0                 # encoding: [0x7e,0xf0,0x00,0x00]
+
+	au	%f0, 0
+	au	%f0, 4095
+	au	%f0, 0(%r1)
+	au	%f0, 0(%r15)
+	au	%f0, 4095(%r1,%r15)
+	au	%f0, 4095(%r15,%r1)
+	au	%f15, 0
+
+#CHECK: aur	%f0, %f0                # encoding: [0x3e,0x00]
+#CHECK: aur	%f0, %f15               # encoding: [0x3e,0x0f]
+#CHECK: aur	%f7, %f8                # encoding: [0x3e,0x78]
+#CHECK: aur	%f15, %f0               # encoding: [0x3e,0xf0]
+
+	aur	%f0, %f0
+	aur	%f0, %f15
+	aur	%f7, %f8
+	aur	%f15, %f0
+
+#CHECK: aw	%f0, 0                  # encoding: [0x6e,0x00,0x00,0x00]
+#CHECK: aw	%f0, 4095               # encoding: [0x6e,0x00,0x0f,0xff]
+#CHECK: aw	%f0, 0(%r1)             # encoding: [0x6e,0x00,0x10,0x00]
+#CHECK: aw	%f0, 0(%r15)            # encoding: [0x6e,0x00,0xf0,0x00]
+#CHECK: aw	%f0, 4095(%r1,%r15)     # encoding: [0x6e,0x01,0xff,0xff]
+#CHECK: aw	%f0, 4095(%r15,%r1)     # encoding: [0x6e,0x0f,0x1f,0xff]
+#CHECK: aw	%f15, 0                 # encoding: [0x6e,0xf0,0x00,0x00]
+
+	aw	%f0, 0
+	aw	%f0, 4095
+	aw	%f0, 0(%r1)
+	aw	%f0, 0(%r15)
+	aw	%f0, 4095(%r1,%r15)
+	aw	%f0, 4095(%r15,%r1)
+	aw	%f15, 0
+
+#CHECK: awr	%f0, %f0                # encoding: [0x2e,0x00]
+#CHECK: awr	%f0, %f15               # encoding: [0x2e,0x0f]
+#CHECK: awr	%f7, %f8                # encoding: [0x2e,0x78]
+#CHECK: awr	%f15, %f0               # encoding: [0x2e,0xf0]
+
+	awr	%f0, %f0
+	awr	%f0, %f15
+	awr	%f7, %f8
+	awr	%f15, %f0
+
 #CHECK: axbr	%f0, %f0                # encoding: [0xb3,0x4a,0x00,0x00]
 #CHECK: axbr	%f0, %f13               # encoding: [0xb3,0x4a,0x00,0x0d]
 #CHECK: axbr	%f8, %f8                # encoding: [0xb3,0x4a,0x00,0x88]
@@ -581,6 +697,28 @@
 	axbr	%f8, %f8
 	axbr	%f13, %f0
 
+#CHECK: axr	%f0, %f0                # encoding: [0x36,0x00]
+#CHECK: axr	%f0, %f13               # encoding: [0x36,0x0d]
+#CHECK: axr	%f8, %f8                # encoding: [0x36,0x88]
+#CHECK: axr	%f13, %f0               # encoding: [0x36,0xd0]
+
+	axr	%f0, %f0
+	axr	%f0, %f13
+	axr	%f8, %f8
+	axr	%f13, %f0
+
+#CHECK: axtr	%f0, %f0, %f0           # encoding: [0xb3,0xda,0x00,0x00]
+#CHECK: axtr	%f0, %f0, %f13          # encoding: [0xb3,0xda,0xd0,0x00]
+#CHECK: axtr	%f0, %f13, %f0          # encoding: [0xb3,0xda,0x00,0x0d]
+#CHECK: axtr	%f13, %f0, %f0          # encoding: [0xb3,0xda,0x00,0xd0]
+#CHECK: axtr	%f8, %f8, %f8           # encoding: [0xb3,0xda,0x80,0x88]
+
+	axtr	%f0, %f0, %f0
+	axtr	%f0, %f0, %f13
+	axtr	%f0, %f13, %f0
+	axtr	%f13, %f0, %f0
+	axtr	%f8, %f8, %f8
+
 #CHECK: ay	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x5a]
 #CHECK: ay	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x5a]
 #CHECK: ay	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x5a]
@@ -1885,6 +2023,22 @@
 	c	%r0, 4095(%r15,%r1)
 	c	%r15, 0
 
+#CHECK: cd	%f0, 0                  # encoding: [0x69,0x00,0x00,0x00]
+#CHECK: cd	%f0, 4095               # encoding: [0x69,0x00,0x0f,0xff]
+#CHECK: cd	%f0, 0(%r1)             # encoding: [0x69,0x00,0x10,0x00]
+#CHECK: cd	%f0, 0(%r15)            # encoding: [0x69,0x00,0xf0,0x00]
+#CHECK: cd	%f0, 4095(%r1,%r15)     # encoding: [0x69,0x01,0xff,0xff]
+#CHECK: cd	%f0, 4095(%r15,%r1)     # encoding: [0x69,0x0f,0x1f,0xff]
+#CHECK: cd	%f15, 0                 # encoding: [0x69,0xf0,0x00,0x00]
+
+	cd	%f0, 0
+	cd	%f0, 4095
+	cd	%f0, 0(%r1)
+	cd	%f0, 0(%r15)
+	cd	%f0, 4095(%r1,%r15)
+	cd	%f0, 4095(%r15,%r1)
+	cd	%f15, 0
+
 #CHECK: cdb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x19]
 #CHECK: cdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x19]
 #CHECK: cdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x19]
@@ -1923,6 +2077,18 @@
 	cdfbr	%f7, %r8
 	cdfbr	%f15, %r15
 
+#CHECK: cdfr	%f0, %r0                # encoding: [0xb3,0xb5,0x00,0x00]
+#CHECK: cdfr	%f0, %r15               # encoding: [0xb3,0xb5,0x00,0x0f]
+#CHECK: cdfr	%f15, %r0               # encoding: [0xb3,0xb5,0x00,0xf0]
+#CHECK: cdfr	%f7, %r8                # encoding: [0xb3,0xb5,0x00,0x78]
+#CHECK: cdfr	%f15, %r15              # encoding: [0xb3,0xb5,0x00,0xff]
+
+	cdfr	%f0, %r0
+	cdfr	%f0, %r15
+	cdfr	%f15, %r0
+	cdfr	%f7, %r8
+	cdfr	%f15, %r15
+
 #CHECK: cdgbr	%f0, %r0                # encoding: [0xb3,0xa5,0x00,0x00]
 #CHECK: cdgbr	%f0, %r15               # encoding: [0xb3,0xa5,0x00,0x0f]
 #CHECK: cdgbr	%f15, %r0               # encoding: [0xb3,0xa5,0x00,0xf0]
@@ -1935,6 +2101,40 @@
 	cdgbr	%f7, %r8
 	cdgbr	%f15, %r15
 
+#CHECK: cdgr	%f0, %r0                # encoding: [0xb3,0xc5,0x00,0x00]
+#CHECK: cdgr	%f0, %r15               # encoding: [0xb3,0xc5,0x00,0x0f]
+#CHECK: cdgr	%f15, %r0               # encoding: [0xb3,0xc5,0x00,0xf0]
+#CHECK: cdgr	%f7, %r8                # encoding: [0xb3,0xc5,0x00,0x78]
+#CHECK: cdgr	%f15, %r15              # encoding: [0xb3,0xc5,0x00,0xff]
+
+	cdgr	%f0, %r0
+	cdgr	%f0, %r15
+	cdgr	%f15, %r0
+	cdgr	%f7, %r8
+	cdgr	%f15, %r15
+
+#CHECK: cdgtr	%f0, %r0                # encoding: [0xb3,0xf1,0x00,0x00]
+#CHECK: cdgtr	%f0, %r15               # encoding: [0xb3,0xf1,0x00,0x0f]
+#CHECK: cdgtr	%f15, %r0               # encoding: [0xb3,0xf1,0x00,0xf0]
+#CHECK: cdgtr	%f7, %r8                # encoding: [0xb3,0xf1,0x00,0x78]
+#CHECK: cdgtr	%f15, %r15              # encoding: [0xb3,0xf1,0x00,0xff]
+
+	cdgtr	%f0, %r0
+	cdgtr	%f0, %r15
+	cdgtr	%f15, %r0
+	cdgtr	%f7, %r8
+	cdgtr	%f15, %r15
+
+#CHECK: cdr	%f0, %f0                # encoding: [0x29,0x00]
+#CHECK: cdr	%f0, %f15               # encoding: [0x29,0x0f]
+#CHECK: cdr	%f7, %f8                # encoding: [0x29,0x78]
+#CHECK: cdr	%f15, %f0               # encoding: [0x29,0xf0]
+
+	cdr	%f0, %f0
+	cdr	%f0, %f15
+	cdr	%f7, %f8
+	cdr	%f15, %f0
+
 #CHECK: cds	%r0, %r0, 0             # encoding: [0xbb,0x00,0x00,0x00]
 #CHECK: cds	%r0, %r0, 4095          # encoding: [0xbb,0x00,0x0f,0xff]
 #CHECK: cds	%r0, %r0, 0(%r1)        # encoding: [0xbb,0x00,0x10,0x00]
@@ -1977,6 +2177,18 @@
 	cdsg	%r0, %r14, 0
 	cdsg	%r14, %r0, 0
 
+#CHECK: cdstr	%f0, %r0                # encoding: [0xb3,0xf3,0x00,0x00]
+#CHECK: cdstr	%f0, %r15               # encoding: [0xb3,0xf3,0x00,0x0f]
+#CHECK: cdstr	%f15, %r0               # encoding: [0xb3,0xf3,0x00,0xf0]
+#CHECK: cdstr	%f7, %r8                # encoding: [0xb3,0xf3,0x00,0x78]
+#CHECK: cdstr	%f15, %r15              # encoding: [0xb3,0xf3,0x00,0xff]
+
+	cdstr	%f0, %r0
+	cdstr	%f0, %r15
+	cdstr	%f15, %r0
+	cdstr	%f7, %r8
+	cdstr	%f15, %r15
+
 #CHECK: cdsy	%r0, %r0, -524288       # encoding: [0xeb,0x00,0x00,0x00,0x80,0x31]
 #CHECK: cdsy	%r0, %r0, -1            # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x31]
 #CHECK: cdsy	%r0, %r0, 0             # encoding: [0xeb,0x00,0x00,0x00,0x00,0x31]
@@ -2001,6 +2213,44 @@
 	cdsy	%r0, %r14, 0
 	cdsy	%r14, %r0, 0
 
+#CHECK: cdtr	%f0, %f0                # encoding: [0xb3,0xe4,0x00,0x00]
+#CHECK: cdtr	%f0, %f15               # encoding: [0xb3,0xe4,0x00,0x0f]
+#CHECK: cdtr	%f7, %f8                # encoding: [0xb3,0xe4,0x00,0x78]
+#CHECK: cdtr	%f15, %f0               # encoding: [0xb3,0xe4,0x00,0xf0]
+
+	cdtr	%f0, %f0
+	cdtr	%f0, %f15
+	cdtr	%f7, %f8
+	cdtr	%f15, %f0
+
+#CHECK: cdutr	%f0, %r0                # encoding: [0xb3,0xf2,0x00,0x00]
+#CHECK: cdutr	%f0, %r15               # encoding: [0xb3,0xf2,0x00,0x0f]
+#CHECK: cdutr	%f15, %r0               # encoding: [0xb3,0xf2,0x00,0xf0]
+#CHECK: cdutr	%f7, %r8                # encoding: [0xb3,0xf2,0x00,0x78]
+#CHECK: cdutr	%f15, %r15              # encoding: [0xb3,0xf2,0x00,0xff]
+
+	cdutr	%f0, %r0
+	cdutr	%f0, %r15
+	cdutr	%f15, %r0
+	cdutr	%f7, %r8
+	cdutr	%f15, %r15
+
+#CHECK: ce	%f0, 0                  # encoding: [0x79,0x00,0x00,0x00]
+#CHECK: ce	%f0, 4095               # encoding: [0x79,0x00,0x0f,0xff]
+#CHECK: ce	%f0, 0(%r1)             # encoding: [0x79,0x00,0x10,0x00]
+#CHECK: ce	%f0, 0(%r15)            # encoding: [0x79,0x00,0xf0,0x00]
+#CHECK: ce	%f0, 4095(%r1,%r15)     # encoding: [0x79,0x01,0xff,0xff]
+#CHECK: ce	%f0, 4095(%r15,%r1)     # encoding: [0x79,0x0f,0x1f,0xff]
+#CHECK: ce	%f15, 0                 # encoding: [0x79,0xf0,0x00,0x00]
+
+	ce	%f0, 0
+	ce	%f0, 4095
+	ce	%f0, 0(%r1)
+	ce	%f0, 0(%r15)
+	ce	%f0, 4095(%r1,%r15)
+	ce	%f0, 4095(%r15,%r1)
+	ce	%f15, 0
+
 #CHECK: ceb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x09]
 #CHECK: ceb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x09]
 #CHECK: ceb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x09]
@@ -2027,6 +2277,16 @@
 	cebr	%f7, %f8
 	cebr	%f15, %f0
 
+#CHECK: cedtr	%f0, %f0                # encoding: [0xb3,0xf4,0x00,0x00]
+#CHECK: cedtr	%f0, %f15               # encoding: [0xb3,0xf4,0x00,0x0f]
+#CHECK: cedtr	%f7, %f8                # encoding: [0xb3,0xf4,0x00,0x78]
+#CHECK: cedtr	%f15, %f0               # encoding: [0xb3,0xf4,0x00,0xf0]
+
+	cedtr	%f0, %f0
+	cedtr	%f0, %f15
+	cedtr	%f7, %f8
+	cedtr	%f15, %f0
+
 #CHECK: cefbr	%f0, %r0                # encoding: [0xb3,0x94,0x00,0x00]
 #CHECK: cefbr	%f0, %r15               # encoding: [0xb3,0x94,0x00,0x0f]
 #CHECK: cefbr	%f15, %r0               # encoding: [0xb3,0x94,0x00,0xf0]
@@ -2039,6 +2299,18 @@
 	cefbr	%f7, %r8
 	cefbr	%f15, %r15
 
+#CHECK: cefr	%f0, %r0                # encoding: [0xb3,0xb4,0x00,0x00]
+#CHECK: cefr	%f0, %r15               # encoding: [0xb3,0xb4,0x00,0x0f]
+#CHECK: cefr	%f15, %r0               # encoding: [0xb3,0xb4,0x00,0xf0]
+#CHECK: cefr	%f7, %r8                # encoding: [0xb3,0xb4,0x00,0x78]
+#CHECK: cefr	%f15, %r15              # encoding: [0xb3,0xb4,0x00,0xff]
+
+	cefr	%f0, %r0
+	cefr	%f0, %r15
+	cefr	%f15, %r0
+	cefr	%f7, %r8
+	cefr	%f15, %r15
+
 #CHECK: cegbr	%f0, %r0                # encoding: [0xb3,0xa4,0x00,0x00]
 #CHECK: cegbr	%f0, %r15               # encoding: [0xb3,0xa4,0x00,0x0f]
 #CHECK: cegbr	%f15, %r0               # encoding: [0xb3,0xa4,0x00,0xf0]
@@ -2051,6 +2323,38 @@
 	cegbr	%f7, %r8
 	cegbr	%f15, %r15
 
+#CHECK: cegr	%f0, %r0                # encoding: [0xb3,0xc4,0x00,0x00]
+#CHECK: cegr	%f0, %r15               # encoding: [0xb3,0xc4,0x00,0x0f]
+#CHECK: cegr	%f15, %r0               # encoding: [0xb3,0xc4,0x00,0xf0]
+#CHECK: cegr	%f7, %r8                # encoding: [0xb3,0xc4,0x00,0x78]
+#CHECK: cegr	%f15, %r15              # encoding: [0xb3,0xc4,0x00,0xff]
+
+	cegr	%f0, %r0
+	cegr	%f0, %r15
+	cegr	%f15, %r0
+	cegr	%f7, %r8
+	cegr	%f15, %r15
+
+#CHECK: cer	%f0, %f0                # encoding: [0x39,0x00]
+#CHECK: cer	%f0, %f15               # encoding: [0x39,0x0f]
+#CHECK: cer	%f7, %f8                # encoding: [0x39,0x78]
+#CHECK: cer	%f15, %f0               # encoding: [0x39,0xf0]
+
+	cer	%f0, %f0
+	cer	%f0, %f15
+	cer	%f7, %f8
+	cer	%f15, %f0
+
+#CHECK: cextr	%f0, %f0                # encoding: [0xb3,0xfc,0x00,0x00]
+#CHECK: cextr	%f0, %f13               # encoding: [0xb3,0xfc,0x00,0x0d]
+#CHECK: cextr	%f8, %f8                # encoding: [0xb3,0xfc,0x00,0x88]
+#CHECK: cextr	%f13, %f0               # encoding: [0xb3,0xfc,0x00,0xd0]
+
+	cextr	%f0, %f0
+	cextr	%f0, %f13
+	cextr	%f8, %f8
+	cextr	%f13, %f0
+
 #CHECK: cfc	0                       # encoding: [0xb2,0x1a,0x00,0x00]
 #CHECK: cfc	0(%r1)                  # encoding: [0xb2,0x1a,0x10,0x00]
 #CHECK: cfc	0(%r15)                 # encoding: [0xb2,0x1a,0xf0,0x00]
@@ -2077,6 +2381,18 @@
 	cfdbr	%r4, 5, %f6
 	cfdbr	%r15, 0, %f0
 
+#CHECK: cfdr	%r0, 0, %f0             # encoding: [0xb3,0xb9,0x00,0x00]
+#CHECK: cfdr	%r0, 0, %f15            # encoding: [0xb3,0xb9,0x00,0x0f]
+#CHECK: cfdr	%r0, 15, %f0            # encoding: [0xb3,0xb9,0xf0,0x00]
+#CHECK: cfdr	%r4, 5, %f6             # encoding: [0xb3,0xb9,0x50,0x46]
+#CHECK: cfdr	%r15, 0, %f0            # encoding: [0xb3,0xb9,0x00,0xf0]
+
+	cfdr	%r0, 0, %f0
+	cfdr	%r0, 0, %f15
+	cfdr	%r0, 15, %f0
+	cfdr	%r4, 5, %f6
+	cfdr	%r15, 0, %f0
+
 #CHECK: cfebr	%r0, 0, %f0             # encoding: [0xb3,0x98,0x00,0x00]
 #CHECK: cfebr	%r0, 0, %f15            # encoding: [0xb3,0x98,0x00,0x0f]
 #CHECK: cfebr	%r0, 15, %f0            # encoding: [0xb3,0x98,0xf0,0x00]
@@ -2089,6 +2405,18 @@
 	cfebr	%r4, 5, %f6
 	cfebr	%r15, 0, %f0
 
+#CHECK: cfer	%r0, 0, %f0             # encoding: [0xb3,0xb8,0x00,0x00]
+#CHECK: cfer	%r0, 0, %f15            # encoding: [0xb3,0xb8,0x00,0x0f]
+#CHECK: cfer	%r0, 15, %f0            # encoding: [0xb3,0xb8,0xf0,0x00]
+#CHECK: cfer	%r4, 5, %f6             # encoding: [0xb3,0xb8,0x50,0x46]
+#CHECK: cfer	%r15, 0, %f0            # encoding: [0xb3,0xb8,0x00,0xf0]
+
+	cfer	%r0, 0, %f0
+	cfer	%r0, 0, %f15
+	cfer	%r0, 15, %f0
+	cfer	%r4, 5, %f6
+	cfer	%r15, 0, %f0
+
 #CHECK: cfi	%r0, -2147483648        # encoding: [0xc2,0x0d,0x80,0x00,0x00,0x00]
 #CHECK: cfi	%r0, -1                 # encoding: [0xc2,0x0d,0xff,0xff,0xff,0xff]
 #CHECK: cfi	%r0, 0                  # encoding: [0xc2,0x0d,0x00,0x00,0x00,0x00]
@@ -2115,6 +2443,18 @@
 	cfxbr	%r4, 5, %f8
 	cfxbr	%r15, 0, %f0
 
+#CHECK: cfxr	%r0, 0, %f0             # encoding: [0xb3,0xba,0x00,0x00]
+#CHECK: cfxr	%r0, 0, %f13            # encoding: [0xb3,0xba,0x00,0x0d]
+#CHECK: cfxr	%r0, 15, %f0            # encoding: [0xb3,0xba,0xf0,0x00]
+#CHECK: cfxr	%r4, 5, %f8             # encoding: [0xb3,0xba,0x50,0x48]
+#CHECK: cfxr	%r15, 0, %f0            # encoding: [0xb3,0xba,0x00,0xf0]
+
+	cfxr	%r0, 0, %f0
+	cfxr	%r0, 0, %f13
+	cfxr	%r0, 15, %f0
+	cfxr	%r4, 5, %f8
+	cfxr	%r15, 0, %f0
+
 #CHECK: cg	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x20]
 #CHECK: cg	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x20]
 #CHECK: cg	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x20]
@@ -2149,6 +2489,30 @@
 	cgdbr	%r4, 5, %f6
 	cgdbr	%r15, 0, %f0
 
+#CHECK: cgdr	%r0, 0, %f0             # encoding: [0xb3,0xc9,0x00,0x00]
+#CHECK: cgdr	%r0, 0, %f15            # encoding: [0xb3,0xc9,0x00,0x0f]
+#CHECK: cgdr	%r0, 15, %f0            # encoding: [0xb3,0xc9,0xf0,0x00]
+#CHECK: cgdr	%r4, 5, %f6             # encoding: [0xb3,0xc9,0x50,0x46]
+#CHECK: cgdr	%r15, 0, %f0            # encoding: [0xb3,0xc9,0x00,0xf0]
+
+	cgdr	%r0, 0, %f0
+	cgdr	%r0, 0, %f15
+	cgdr	%r0, 15, %f0
+	cgdr	%r4, 5, %f6
+	cgdr	%r15, 0, %f0
+
+#CHECK: cgdtr	%r0, 0, %f0             # encoding: [0xb3,0xe1,0x00,0x00]
+#CHECK: cgdtr	%r0, 0, %f15            # encoding: [0xb3,0xe1,0x00,0x0f]
+#CHECK: cgdtr	%r0, 15, %f0            # encoding: [0xb3,0xe1,0xf0,0x00]
+#CHECK: cgdtr	%r4, 5, %f6             # encoding: [0xb3,0xe1,0x50,0x46]
+#CHECK: cgdtr	%r15, 0, %f0            # encoding: [0xb3,0xe1,0x00,0xf0]
+
+	cgdtr	%r0, 0, %f0
+	cgdtr	%r0, 0, %f15
+	cgdtr	%r0, 15, %f0
+	cgdtr	%r4, 5, %f6
+	cgdtr	%r15, 0, %f0
+
 #CHECK: cgebr	%r0, 0, %f0             # encoding: [0xb3,0xa8,0x00,0x00]
 #CHECK: cgebr	%r0, 0, %f15            # encoding: [0xb3,0xa8,0x00,0x0f]
 #CHECK: cgebr	%r0, 15, %f0            # encoding: [0xb3,0xa8,0xf0,0x00]
@@ -2161,6 +2525,18 @@
 	cgebr	%r4, 5, %f6
 	cgebr	%r15, 0, %f0
 
+#CHECK: cger	%r0, 0, %f0             # encoding: [0xb3,0xc8,0x00,0x00]
+#CHECK: cger	%r0, 0, %f15            # encoding: [0xb3,0xc8,0x00,0x0f]
+#CHECK: cger	%r0, 15, %f0            # encoding: [0xb3,0xc8,0xf0,0x00]
+#CHECK: cger	%r4, 5, %f6             # encoding: [0xb3,0xc8,0x50,0x46]
+#CHECK: cger	%r15, 0, %f0            # encoding: [0xb3,0xc8,0x00,0xf0]
+
+	cger	%r0, 0, %f0
+	cger	%r0, 0, %f15
+	cger	%r0, 15, %f0
+	cger	%r4, 5, %f6
+	cger	%r15, 0, %f0
+
 #CHECK: cgf	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x30]
 #CHECK: cgf	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x30]
 #CHECK: cgf	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x30]
@@ -3063,6 +3439,30 @@
 	cgxbr	%r4, 5, %f8
 	cgxbr	%r15, 0, %f0
 
+#CHECK: cgxr	%r0, 0, %f0             # encoding: [0xb3,0xca,0x00,0x00]
+#CHECK: cgxr	%r0, 0, %f13            # encoding: [0xb3,0xca,0x00,0x0d]
+#CHECK: cgxr	%r0, 15, %f0            # encoding: [0xb3,0xca,0xf0,0x00]
+#CHECK: cgxr	%r4, 5, %f8             # encoding: [0xb3,0xca,0x50,0x48]
+#CHECK: cgxr	%r15, 0, %f0            # encoding: [0xb3,0xca,0x00,0xf0]
+
+	cgxr	%r0, 0, %f0
+	cgxr	%r0, 0, %f13
+	cgxr	%r0, 15, %f0
+	cgxr	%r4, 5, %f8
+	cgxr	%r15, 0, %f0
+
+#CHECK: cgxtr	%r0, 0, %f0             # encoding: [0xb3,0xe9,0x00,0x00]
+#CHECK: cgxtr	%r0, 0, %f13            # encoding: [0xb3,0xe9,0x00,0x0d]
+#CHECK: cgxtr	%r0, 15, %f0            # encoding: [0xb3,0xe9,0xf0,0x00]
+#CHECK: cgxtr	%r4, 5, %f8             # encoding: [0xb3,0xe9,0x50,0x48]
+#CHECK: cgxtr	%r15, 0, %f0            # encoding: [0xb3,0xe9,0x00,0xf0]
+
+	cgxtr	%r0, 0, %f0
+	cgxtr	%r0, 0, %f13
+	cgxtr	%r0, 15, %f0
+	cgxtr	%r4, 5, %f8
+	cgxtr	%r15, 0, %f0
+
 #CHECK: ch	%r0, 0                  # encoding: [0x49,0x00,0x00,0x00]
 #CHECK: ch	%r0, 4095               # encoding: [0x49,0x00,0x0f,0xff]
 #CHECK: ch	%r0, 0(%r1)             # encoding: [0x49,0x00,0x10,0x00]
@@ -5857,6 +6257,18 @@
 	cs	%r0, %r15, 0
 	cs	%r15, %r0, 0
 
+#CHECK: csdtr	%r0, %f0, 0             # encoding: [0xb3,0xe3,0x00,0x00]
+#CHECK: csdtr	%r0, %f15, 0            # encoding: [0xb3,0xe3,0x00,0x0f]
+#CHECK: csdtr	%r0, %f0, 15            # encoding: [0xb3,0xe3,0x0f,0x00]
+#CHECK: csdtr	%r4, %f5, 6             # encoding: [0xb3,0xe3,0x06,0x45]
+#CHECK: csdtr	%r15, %f0, 0            # encoding: [0xb3,0xe3,0x00,0xf0]
+
+	csdtr	%r0, %f0, 0
+	csdtr	%r0, %f15, 0
+	csdtr	%r0, %f0, 15
+	csdtr	%r4, %f5, 6
+	csdtr	%r15, %f0, 0
+
 #CHECK: csg	%r0, %r0, -524288       # encoding: [0xeb,0x00,0x00,0x00,0x80,0x30]
 #CHECK: csg	%r0, %r0, -1            # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x30]
 #CHECK: csg	%r0, %r0, 0             # encoding: [0xeb,0x00,0x00,0x00,0x00,0x30]
@@ -5895,6 +6307,18 @@
         csst	0(%r1), 1(%r15), %r2
         csst	0(%r1), 4095(%r15), %r2
 
+#CHECK: csxtr	%r0, %f0, 0             # encoding: [0xb3,0xeb,0x00,0x00]
+#CHECK: csxtr	%r0, %f13, 0            # encoding: [0xb3,0xeb,0x00,0x0d]
+#CHECK: csxtr	%r0, %f0, 15            # encoding: [0xb3,0xeb,0x0f,0x00]
+#CHECK: csxtr	%r4, %f5, 6             # encoding: [0xb3,0xeb,0x06,0x45]
+#CHECK: csxtr	%r14, %f0, 0            # encoding: [0xb3,0xeb,0x00,0xe0]
+
+	csxtr	%r0, %f0, 0
+	csxtr	%r0, %f13, 0
+	csxtr	%r0, %f0, 15
+	csxtr	%r4, %f5, 6
+	csxtr	%r14, %f0, 0
+
 #CHECK: csy	%r0, %r0, -524288       # encoding: [0xeb,0x00,0x00,0x00,0x80,0x14]
 #CHECK: csy	%r0, %r0, -1            # encoding: [0xeb,0x00,0x0f,0xff,0xff,0x14]
 #CHECK: csy	%r0, %r0, 0             # encoding: [0xeb,0x00,0x00,0x00,0x00,0x14]
@@ -5995,6 +6419,18 @@
 	cu42	%r14, %r0
 	cu42	%r6, %r8
 
+#CHECK: cudtr	%r0, %f0                # encoding: [0xb3,0xe2,0x00,0x00]
+#CHECK: cudtr	%r0, %f15               # encoding: [0xb3,0xe2,0x00,0x0f]
+#CHECK: cudtr	%r15, %f0               # encoding: [0xb3,0xe2,0x00,0xf0]
+#CHECK: cudtr	%r7, %f8                # encoding: [0xb3,0xe2,0x00,0x78]
+#CHECK: cudtr	%r15, %f15              # encoding: [0xb3,0xe2,0x00,0xff]
+
+	cudtr	%r0, %f0
+	cudtr	%r0, %f15
+	cudtr	%r15, %f0
+	cudtr	%r7, %f8
+	cudtr	%r15, %f15
+
 #CHECK: cuse	%r0, %r8                # encoding: [0xb2,0x57,0x00,0x08]
 #CHECK: cuse	%r0, %r14               # encoding: [0xb2,0x57,0x00,0x0e]
 #CHECK: cuse	%r14, %r0               # encoding: [0xb2,0x57,0x00,0xe0]
@@ -6033,6 +6469,18 @@
 	cuutf	%r4, %r12, 0
 	cuutf	%r4, %r12, 15
 
+#CHECK: cuxtr	%r0, %f0                # encoding: [0xb3,0xea,0x00,0x00]
+#CHECK: cuxtr	%r0, %f13               # encoding: [0xb3,0xea,0x00,0x0d]
+#CHECK: cuxtr	%r14, %f0               # encoding: [0xb3,0xea,0x00,0xe0]
+#CHECK: cuxtr	%r6, %f8                # encoding: [0xb3,0xea,0x00,0x68]
+#CHECK: cuxtr	%r14, %f13              # encoding: [0xb3,0xea,0x00,0xed]
+
+	cuxtr	%r0, %f0
+	cuxtr	%r0, %f13
+	cuxtr	%r14, %f0
+	cuxtr	%r6, %f8
+	cuxtr	%r14, %f13
+
 #CHECK: cvb	%r0, 0                  # encoding: [0x4f,0x00,0x00,0x00]
 #CHECK: cvb	%r0, 4095               # encoding: [0x4f,0x00,0x0f,0xff]
 #CHECK: cvb	%r0, 0(%r1)             # encoding: [0x4f,0x00,0x10,0x00]
@@ -6175,6 +6623,18 @@
 	cxfbr	%f8, %r7
 	cxfbr	%f13, %r15
 
+#CHECK: cxfr	%f0, %r0                # encoding: [0xb3,0xb6,0x00,0x00]
+#CHECK: cxfr	%f0, %r15               # encoding: [0xb3,0xb6,0x00,0x0f]
+#CHECK: cxfr	%f13, %r0               # encoding: [0xb3,0xb6,0x00,0xd0]
+#CHECK: cxfr	%f8, %r7                # encoding: [0xb3,0xb6,0x00,0x87]
+#CHECK: cxfr	%f13, %r15              # encoding: [0xb3,0xb6,0x00,0xdf]
+
+	cxfr	%f0, %r0
+	cxfr	%f0, %r15
+	cxfr	%f13, %r0
+	cxfr	%f8, %r7
+	cxfr	%f13, %r15
+
 #CHECK: cxgbr	%f0, %r0                # encoding: [0xb3,0xa6,0x00,0x00]
 #CHECK: cxgbr	%f0, %r15               # encoding: [0xb3,0xa6,0x00,0x0f]
 #CHECK: cxgbr	%f13, %r0               # encoding: [0xb3,0xa6,0x00,0xd0]
@@ -6187,6 +6647,74 @@
 	cxgbr	%f8, %r7
 	cxgbr	%f13, %r15
 
+#CHECK: cxgr	%f0, %r0                # encoding: [0xb3,0xc6,0x00,0x00]
+#CHECK: cxgr	%f0, %r15               # encoding: [0xb3,0xc6,0x00,0x0f]
+#CHECK: cxgr	%f13, %r0               # encoding: [0xb3,0xc6,0x00,0xd0]
+#CHECK: cxgr	%f8, %r7                # encoding: [0xb3,0xc6,0x00,0x87]
+#CHECK: cxgr	%f13, %r15              # encoding: [0xb3,0xc6,0x00,0xdf]
+
+	cxgr	%f0, %r0
+	cxgr	%f0, %r15
+	cxgr	%f13, %r0
+	cxgr	%f8, %r7
+	cxgr	%f13, %r15
+
+#CHECK: cxgtr	%f0, %r0                # encoding: [0xb3,0xf9,0x00,0x00]
+#CHECK: cxgtr	%f0, %r15               # encoding: [0xb3,0xf9,0x00,0x0f]
+#CHECK: cxgtr	%f13, %r0               # encoding: [0xb3,0xf9,0x00,0xd0]
+#CHECK: cxgtr	%f8, %r7                # encoding: [0xb3,0xf9,0x00,0x87]
+#CHECK: cxgtr	%f13, %r15              # encoding: [0xb3,0xf9,0x00,0xdf]
+
+	cxgtr	%f0, %r0
+	cxgtr	%f0, %r15
+	cxgtr	%f13, %r0
+	cxgtr	%f8, %r7
+	cxgtr	%f13, %r15
+
+#CHECK: cxr	%f0, %f0                # encoding: [0xb3,0x69,0x00,0x00]
+#CHECK: cxr	%f0, %f13               # encoding: [0xb3,0x69,0x00,0x0d]
+#CHECK: cxr	%f8, %f8                # encoding: [0xb3,0x69,0x00,0x88]
+#CHECK: cxr	%f13, %f0               # encoding: [0xb3,0x69,0x00,0xd0]
+
+	cxr	%f0, %f0
+	cxr	%f0, %f13
+	cxr	%f8, %f8
+	cxr	%f13, %f0
+
+#CHECK: cxstr	%f0, %r0                # encoding: [0xb3,0xfb,0x00,0x00]
+#CHECK: cxstr	%f0, %r14               # encoding: [0xb3,0xfb,0x00,0x0e]
+#CHECK: cxstr	%f13, %r0               # encoding: [0xb3,0xfb,0x00,0xd0]
+#CHECK: cxstr	%f8, %r6                # encoding: [0xb3,0xfb,0x00,0x86]
+#CHECK: cxstr	%f13, %r14              # encoding: [0xb3,0xfb,0x00,0xde]
+
+	cxstr	%f0, %r0
+	cxstr	%f0, %r14
+	cxstr	%f13, %r0
+	cxstr	%f8, %r6
+	cxstr	%f13, %r14
+
+#CHECK: cxtr	%f0, %f0                # encoding: [0xb3,0xec,0x00,0x00]
+#CHECK: cxtr	%f0, %f13               # encoding: [0xb3,0xec,0x00,0x0d]
+#CHECK: cxtr	%f8, %f8                # encoding: [0xb3,0xec,0x00,0x88]
+#CHECK: cxtr	%f13, %f0               # encoding: [0xb3,0xec,0x00,0xd0]
+
+	cxtr	%f0, %f0
+	cxtr	%f0, %f13
+	cxtr	%f8, %f8
+	cxtr	%f13, %f0
+
+#CHECK: cxutr	%f0, %r0                # encoding: [0xb3,0xfa,0x00,0x00]
+#CHECK: cxutr	%f0, %r14               # encoding: [0xb3,0xfa,0x00,0x0e]
+#CHECK: cxutr	%f13, %r0               # encoding: [0xb3,0xfa,0x00,0xd0]
+#CHECK: cxutr	%f8, %r6                # encoding: [0xb3,0xfa,0x00,0x86]
+#CHECK: cxutr	%f13, %r14              # encoding: [0xb3,0xfa,0x00,0xde]
+
+	cxutr	%f0, %r0
+	cxutr	%f0, %r14
+	cxutr	%f13, %r0
+	cxutr	%f8, %r6
+	cxutr	%f13, %r14
+
 #CHECK: cy	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x59]
 #CHECK: cy	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x59]
 #CHECK: cy	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x59]
@@ -6225,6 +6753,22 @@
 	d	%r0, 4095(%r15,%r1)
 	d	%r14, 0
 
+#CHECK: dd	%f0, 0                  # encoding: [0x6d,0x00,0x00,0x00]
+#CHECK: dd	%f0, 4095               # encoding: [0x6d,0x00,0x0f,0xff]
+#CHECK: dd	%f0, 0(%r1)             # encoding: [0x6d,0x00,0x10,0x00]
+#CHECK: dd	%f0, 0(%r15)            # encoding: [0x6d,0x00,0xf0,0x00]
+#CHECK: dd	%f0, 4095(%r1,%r15)     # encoding: [0x6d,0x01,0xff,0xff]
+#CHECK: dd	%f0, 4095(%r15,%r1)     # encoding: [0x6d,0x0f,0x1f,0xff]
+#CHECK: dd	%f15, 0                 # encoding: [0x6d,0xf0,0x00,0x00]
+
+	dd	%f0, 0
+	dd	%f0, 4095
+	dd	%f0, 0(%r1)
+	dd	%f0, 0(%r15)
+	dd	%f0, 4095(%r1,%r15)
+	dd	%f0, 4095(%r15,%r1)
+	dd	%f15, 0
+
 #CHECK: ddb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x1d]
 #CHECK: ddb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1d]
 #CHECK: ddb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x1d]
@@ -6251,6 +6795,44 @@
 	ddbr	%f7, %f8
 	ddbr	%f15, %f0
 
+#CHECK: ddr	%f0, %f0                # encoding: [0x2d,0x00]
+#CHECK: ddr	%f0, %f15               # encoding: [0x2d,0x0f]
+#CHECK: ddr	%f7, %f8                # encoding: [0x2d,0x78]
+#CHECK: ddr	%f15, %f0               # encoding: [0x2d,0xf0]
+
+	ddr	%f0, %f0
+	ddr	%f0, %f15
+	ddr	%f7, %f8
+	ddr	%f15, %f0
+
+#CHECK: ddtr	%f0, %f0, %f0           # encoding: [0xb3,0xd1,0x00,0x00]
+#CHECK: ddtr	%f0, %f0, %f15          # encoding: [0xb3,0xd1,0xf0,0x00]
+#CHECK: ddtr	%f0, %f15, %f0          # encoding: [0xb3,0xd1,0x00,0x0f]
+#CHECK: ddtr	%f15, %f0, %f0          # encoding: [0xb3,0xd1,0x00,0xf0]
+#CHECK: ddtr	%f7, %f8, %f9           # encoding: [0xb3,0xd1,0x90,0x78]
+
+	ddtr	%f0, %f0, %f0
+	ddtr	%f0, %f0, %f15
+	ddtr	%f0, %f15, %f0
+	ddtr	%f15, %f0, %f0
+	ddtr	%f7, %f8, %f9
+
+#CHECK: de	%f0, 0                  # encoding: [0x7d,0x00,0x00,0x00]
+#CHECK: de	%f0, 4095               # encoding: [0x7d,0x00,0x0f,0xff]
+#CHECK: de	%f0, 0(%r1)             # encoding: [0x7d,0x00,0x10,0x00]
+#CHECK: de	%f0, 0(%r15)            # encoding: [0x7d,0x00,0xf0,0x00]
+#CHECK: de	%f0, 4095(%r1,%r15)     # encoding: [0x7d,0x01,0xff,0xff]
+#CHECK: de	%f0, 4095(%r15,%r1)     # encoding: [0x7d,0x0f,0x1f,0xff]
+#CHECK: de	%f15, 0                 # encoding: [0x7d,0xf0,0x00,0x00]
+
+	de	%f0, 0
+	de	%f0, 4095
+	de	%f0, 0(%r1)
+	de	%f0, 0(%r15)
+	de	%f0, 4095(%r1,%r15)
+	de	%f0, 4095(%r15,%r1)
+	de	%f15, 0
+
 #CHECK: deb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x0d]
 #CHECK: deb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x0d]
 #CHECK: deb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x0d]
@@ -6277,6 +6859,16 @@
 	debr	%f7, %f8
 	debr	%f15, %f0
 
+#CHECK: der	%f0, %f0                # encoding: [0x3d,0x00]
+#CHECK: der	%f0, %f15               # encoding: [0x3d,0x0f]
+#CHECK: der	%f7, %f8                # encoding: [0x3d,0x78]
+#CHECK: der	%f15, %f0               # encoding: [0x3d,0xf0]
+
+	der	%f0, %f0
+	der	%f0, %f15
+	der	%f7, %f8
+	der	%f15, %f0
+
 #CHECK: didbr	%f0, %f0, %f0, 0        # encoding: [0xb3,0x5b,0x00,0x00]
 #CHECK: didbr	%f0, %f0, %f0, 15       # encoding: [0xb3,0x5b,0x0f,0x00]
 #CHECK: didbr	%f0, %f0, %f15, 0       # encoding: [0xb3,0x5b,0x00,0x0f]
@@ -6483,6 +7075,28 @@
 	dxbr	%f8, %f8
 	dxbr	%f13, %f0
 
+#CHECK: dxr	%f0, %f0                # encoding: [0xb2,0x2d,0x00,0x00]
+#CHECK: dxr	%f0, %f13               # encoding: [0xb2,0x2d,0x00,0x0d]
+#CHECK: dxr	%f8, %f8                # encoding: [0xb2,0x2d,0x00,0x88]
+#CHECK: dxr	%f13, %f0               # encoding: [0xb2,0x2d,0x00,0xd0]
+
+	dxr	%f0, %f0
+	dxr	%f0, %f13
+	dxr	%f8, %f8
+	dxr	%f13, %f0
+
+#CHECK: dxtr	%f0, %f0, %f0           # encoding: [0xb3,0xd9,0x00,0x00]
+#CHECK: dxtr	%f0, %f0, %f13          # encoding: [0xb3,0xd9,0xd0,0x00]
+#CHECK: dxtr	%f0, %f13, %f0          # encoding: [0xb3,0xd9,0x00,0x0d]
+#CHECK: dxtr	%f13, %f0, %f0          # encoding: [0xb3,0xd9,0x00,0xd0]
+#CHECK: dxtr	%f8, %f8, %f8           # encoding: [0xb3,0xd9,0x80,0x88]
+
+	dxtr	%f0, %f0, %f0
+	dxtr	%f0, %f0, %f13
+	dxtr	%f0, %f13, %f0
+	dxtr	%f13, %f0, %f0
+	dxtr	%f8, %f8, %f8
+
 #CHECK: ear	%r0, %a0                # encoding: [0xb2,0x4f,0x00,0x00]
 #CHECK: ear	%r0, %a15               # encoding: [0xb2,0x4f,0x00,0x0f]
 #CHECK: ear	%r15, %a0               # encoding: [0xb2,0x4f,0x00,0xf0]
@@ -6589,6 +7203,26 @@
 	edmk	0(256,%r1), 0
 	edmk	0(256,%r15), 0
 
+#CHECK: eedtr	%f0, %f9                # encoding: [0xb3,0xe5,0x00,0x09]
+#CHECK: eedtr	%f0, %f15               # encoding: [0xb3,0xe5,0x00,0x0f]
+#CHECK: eedtr	%f15, %f0               # encoding: [0xb3,0xe5,0x00,0xf0]
+#CHECK: eedtr	%f15, %f9               # encoding: [0xb3,0xe5,0x00,0xf9]
+
+	eedtr	%f0,%f9
+	eedtr	%f0,%f15
+	eedtr	%f15,%f0
+	eedtr	%f15,%f9
+
+#CHECK: eextr	%f0, %f8                # encoding: [0xb3,0xed,0x00,0x08]
+#CHECK: eextr	%f0, %f13               # encoding: [0xb3,0xed,0x00,0x0d]
+#CHECK: eextr	%f13, %f0               # encoding: [0xb3,0xed,0x00,0xd0]
+#CHECK: eextr	%f13, %f9               # encoding: [0xb3,0xed,0x00,0xd9]
+
+	eextr	%f0,%f8
+	eextr	%f0,%f13
+	eextr	%f13,%f0
+	eextr	%f13,%f9
+
 #CHECK: efpc	%r0                     # encoding: [0xb3,0x8c,0x00,0x00]
 #CHECK: efpc	%r1                     # encoding: [0xb3,0x8c,0x00,0x10]
 #CHECK: efpc	%r15                    # encoding: [0xb3,0x8c,0x00,0xf0]
@@ -6607,6 +7241,26 @@
 	epsw	%r15, %r0
 	epsw	%r15, %r8
 
+#CHECK: esdtr	%f0, %f9                # encoding: [0xb3,0xe7,0x00,0x09]
+#CHECK: esdtr	%f0, %f15               # encoding: [0xb3,0xe7,0x00,0x0f]
+#CHECK: esdtr	%f15, %f0               # encoding: [0xb3,0xe7,0x00,0xf0]
+#CHECK: esdtr	%f15, %f9               # encoding: [0xb3,0xe7,0x00,0xf9]
+
+	esdtr	%f0,%f9
+	esdtr	%f0,%f15
+	esdtr	%f15,%f0
+	esdtr	%f15,%f9
+
+#CHECK: esxtr	%f0, %f8                # encoding: [0xb3,0xef,0x00,0x08]
+#CHECK: esxtr	%f0, %f13               # encoding: [0xb3,0xef,0x00,0x0d]
+#CHECK: esxtr	%f13, %f0               # encoding: [0xb3,0xef,0x00,0xd0]
+#CHECK: esxtr	%f13, %f9               # encoding: [0xb3,0xef,0x00,0xd9]
+
+	esxtr	%f0,%f8
+	esxtr	%f0,%f13
+	esxtr	%f13,%f0
+	esxtr	%f13,%f9
+
 #CHECK: ex	%r0, 0                  # encoding: [0x44,0x00,0x00,0x00]
 #CHECK: ex	%r0, 4095               # encoding: [0x44,0x00,0x0f,0xff]
 #CHECK: ex	%r0, 0(%r1)             # encoding: [0x44,0x00,0x10,0x00]
@@ -6672,6 +7326,30 @@
 	fidbr	%f4, 5, %f6
 	fidbr	%f15, 0, %f0
 
+#CHECK: fidr	%f0, %f0                # encoding: [0xb3,0x7f,0x00,0x00]
+#CHECK: fidr	%f0, %f15               # encoding: [0xb3,0x7f,0x00,0x0f]
+#CHECK: fidr	%f4, %f6                # encoding: [0xb3,0x7f,0x00,0x46]
+#CHECK: fidr	%f15, %f0               # encoding: [0xb3,0x7f,0x00,0xf0]
+
+	fidr	%f0, %f0
+	fidr	%f0, %f15
+	fidr	%f4, %f6
+	fidr	%f15, %f0
+
+#CHECK: fidtr	%f0, 0, %f0, 0          # encoding: [0xb3,0xd7,0x00,0x00]
+#CHECK: fidtr	%f0, 0, %f0, 15         # encoding: [0xb3,0xd7,0x0f,0x00]
+#CHECK: fidtr	%f0, 0, %f15, 0         # encoding: [0xb3,0xd7,0x00,0x0f]
+#CHECK: fidtr	%f0, 15, %f0, 0         # encoding: [0xb3,0xd7,0xf0,0x00]
+#CHECK: fidtr	%f4, 5, %f6, 7          # encoding: [0xb3,0xd7,0x57,0x46]
+#CHECK: fidtr	%f15, 0, %f0, 0         # encoding: [0xb3,0xd7,0x00,0xf0]
+
+	fidtr	%f0, 0, %f0, 0
+	fidtr	%f0, 0, %f0, 15
+	fidtr	%f0, 0, %f15, 0
+	fidtr	%f0, 15, %f0, 0
+	fidtr	%f4, 5, %f6, 7
+	fidtr	%f15, 0, %f0, 0
+
 #CHECK: fiebr	%f0, 0, %f0             # encoding: [0xb3,0x57,0x00,0x00]
 #CHECK: fiebr	%f0, 0, %f15            # encoding: [0xb3,0x57,0x00,0x0f]
 #CHECK: fiebr	%f0, 15, %f0            # encoding: [0xb3,0x57,0xf0,0x00]
@@ -6684,6 +7362,16 @@
 	fiebr	%f4, 5, %f6
 	fiebr	%f15, 0, %f0
 
+#CHECK: fier	%f0, %f0                # encoding: [0xb3,0x77,0x00,0x00]
+#CHECK: fier	%f0, %f15               # encoding: [0xb3,0x77,0x00,0x0f]
+#CHECK: fier	%f4, %f6                # encoding: [0xb3,0x77,0x00,0x46]
+#CHECK: fier	%f15, %f0               # encoding: [0xb3,0x77,0x00,0xf0]
+
+	fier	%f0, %f0
+	fier	%f0, %f15
+	fier	%f4, %f6
+	fier	%f15, %f0
+
 #CHECK: fixbr	%f0, 0, %f0             # encoding: [0xb3,0x47,0x00,0x00]
 #CHECK: fixbr	%f0, 0, %f13            # encoding: [0xb3,0x47,0x00,0x0d]
 #CHECK: fixbr	%f0, 15, %f0            # encoding: [0xb3,0x47,0xf0,0x00]
@@ -6696,6 +7384,30 @@
 	fixbr	%f4, 5, %f8
 	fixbr	%f13, 0, %f0
 
+#CHECK: fixr	%f0, %f0                # encoding: [0xb3,0x67,0x00,0x00]
+#CHECK: fixr	%f0, %f13               # encoding: [0xb3,0x67,0x00,0x0d]
+#CHECK: fixr	%f4, %f8                # encoding: [0xb3,0x67,0x00,0x48]
+#CHECK: fixr	%f13, %f0               # encoding: [0xb3,0x67,0x00,0xd0]
+
+	fixr	%f0, %f0
+	fixr	%f0, %f13
+	fixr	%f4, %f8
+	fixr	%f13, %f0
+
+#CHECK: fixtr	%f0, 0, %f0, 0          # encoding: [0xb3,0xdf,0x00,0x00]
+#CHECK: fixtr	%f0, 0, %f0, 15         # encoding: [0xb3,0xdf,0x0f,0x00]
+#CHECK: fixtr	%f0, 0, %f13, 0         # encoding: [0xb3,0xdf,0x00,0x0d]
+#CHECK: fixtr	%f0, 15, %f0, 0         # encoding: [0xb3,0xdf,0xf0,0x00]
+#CHECK: fixtr	%f4, 5, %f8, 9          # encoding: [0xb3,0xdf,0x59,0x48]
+#CHECK: fixtr	%f13, 0, %f0, 0         # encoding: [0xb3,0xdf,0x00,0xd0]
+
+	fixtr	%f0, 0, %f0, 0
+	fixtr	%f0, 0, %f0, 15
+	fixtr	%f0, 0, %f13, 0
+	fixtr	%f0, 15, %f0, 0
+	fixtr	%f4, 5, %f8, 9
+	fixtr	%f13, 0, %f0, 0
+
 #CHECK: flogr	%r0, %r0                # encoding: [0xb9,0x83,0x00,0x00]
 #CHECK: flogr	%r0, %r15               # encoding: [0xb9,0x83,0x00,0x0f]
 #CHECK: flogr	%r10, %r9               # encoding: [0xb9,0x83,0x00,0xa9]
@@ -6706,6 +7418,26 @@
 	flogr	%r10, %r9
 	flogr	%r14, %r0
 
+#CHECK: hdr	%f0, %f0                # encoding: [0x24,0x00]
+#CHECK: hdr	%f0, %f15               # encoding: [0x24,0x0f]
+#CHECK: hdr	%f7, %f8                # encoding: [0x24,0x78]
+#CHECK: hdr	%f15, %f0               # encoding: [0x24,0xf0]
+
+	hdr	%f0, %f0
+	hdr	%f0, %f15
+	hdr	%f7, %f8
+	hdr	%f15, %f0
+
+#CHECK: her	%f0, %f0                # encoding: [0x34,0x00]
+#CHECK: her	%f0, %f15               # encoding: [0x34,0x0f]
+#CHECK: her	%f7, %f8                # encoding: [0x34,0x78]
+#CHECK: her	%f15, %f0               # encoding: [0x34,0xf0]
+
+	her	%f0, %f0
+	her	%f0, %f15
+	her	%f7, %f8
+	her	%f15, %f0
+
 #CHECK: ic	%r0, 0                  # encoding: [0x43,0x00,0x00,0x00]
 #CHECK: ic	%r0, 4095               # encoding: [0x43,0x00,0x0f,0xff]
 #CHECK: ic	%r0, 0(%r1)             # encoding: [0x43,0x00,0x10,0x00]
@@ -6804,6 +7536,34 @@
 	icy	%r0, 524287(%r15,%r1)
 	icy	%r15, 0
 
+#CHECK: iedtr	%f0, %f0, %f0           # encoding: [0xb3,0xf6,0x00,0x00]
+#CHECK: iedtr	%f0, %f0, %f15          # encoding: [0xb3,0xf6,0x00,0x0f]
+#CHECK: iedtr	%f0, %f15, %f0          # encoding: [0xb3,0xf6,0xf0,0x00]
+#CHECK: iedtr	%f15, %f0, %f0          # encoding: [0xb3,0xf6,0x00,0xf0]
+#CHECK: iedtr	%f1, %f2, %f3           # encoding: [0xb3,0xf6,0x20,0x13]
+#CHECK: iedtr	%f15, %f15, %f15        # encoding: [0xb3,0xf6,0xf0,0xff]
+
+	iedtr	%f0, %f0, %f0
+	iedtr	%f0, %f0, %f15
+	iedtr	%f0, %f15, %f0
+	iedtr	%f15, %f0, %f0
+	iedtr	%f1, %f2, %f3
+	iedtr	%f15, %f15, %f15
+
+#CHECK: iextr	%f0, %f0, %f0           # encoding: [0xb3,0xfe,0x00,0x00]
+#CHECK: iextr	%f0, %f0, %f13          # encoding: [0xb3,0xfe,0x00,0x0d]
+#CHECK: iextr	%f0, %f13, %f0          # encoding: [0xb3,0xfe,0xd0,0x00]
+#CHECK: iextr	%f13, %f0, %f0          # encoding: [0xb3,0xfe,0x00,0xd0]
+#CHECK: iextr	%f1, %f8, %f4           # encoding: [0xb3,0xfe,0x80,0x14]
+#CHECK: iextr	%f13, %f13, %f13        # encoding: [0xb3,0xfe,0xd0,0xdd]
+
+	iextr	%f0, %f0, %f0
+	iextr	%f0, %f0, %f13
+	iextr	%f0, %f13, %f0
+	iextr	%f13, %f0, %f0
+	iextr	%f1, %f8, %f4
+	iextr	%f13, %f13, %f13
+
 #CHECK: iihf	%r0, 0                  # encoding: [0xc0,0x08,0x00,0x00,0x00,0x00]
 #CHECK: iihf	%r0, 4294967295         # encoding: [0xc0,0x08,0xff,0xff,0xff,0xff]
 #CHECK: iihf	%r15, 0                 # encoding: [0xc0,0xf8,0x00,0x00,0x00,0x00]
@@ -6894,6 +7654,16 @@
 	kdbr	%f7, %f8
 	kdbr	%f15, %f0
 
+#CHECK: kdtr	%f0, %f0                # encoding: [0xb3,0xe0,0x00,0x00]
+#CHECK: kdtr	%f0, %f15               # encoding: [0xb3,0xe0,0x00,0x0f]
+#CHECK: kdtr	%f7, %f8                # encoding: [0xb3,0xe0,0x00,0x78]
+#CHECK: kdtr	%f15, %f0               # encoding: [0xb3,0xe0,0x00,0xf0]
+
+	kdtr	%f0, %f0
+	kdtr	%f0, %f15
+	kdtr	%f7, %f8
+	kdtr	%f15, %f0
+
 #CHECK: keb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x08]
 #CHECK: keb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x08]
 #CHECK: keb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x08]
@@ -6980,6 +7750,16 @@
 	kxbr	%f8, %f8
 	kxbr	%f13, %f0
 
+#CHECK: kxtr	%f0, %f0                # encoding: [0xb3,0xe8,0x00,0x00]
+#CHECK: kxtr	%f0, %f13               # encoding: [0xb3,0xe8,0x00,0x0d]
+#CHECK: kxtr	%f8, %f8                # encoding: [0xb3,0xe8,0x00,0x88]
+#CHECK: kxtr	%f13, %f0               # encoding: [0xb3,0xe8,0x00,0xd0]
+
+	kxtr	%f0, %f0
+	kxtr	%f0, %f13
+	kxtr	%f8, %f8
+	kxtr	%f13, %f0
+
 #CHECK: l	%r0, 0                  # encoding: [0x58,0x00,0x00,0x00]
 #CHECK: l	%r0, 4095               # encoding: [0x58,0x00,0x0f,0xff]
 #CHECK: l	%r0, 0(%r1)             # encoding: [0x58,0x00,0x10,0x00]
@@ -7199,6 +7979,16 @@
 	lcdbr	%f15,%f0
 	lcdbr	%f15,%f9
 
+#CHECK: lcdr	%f0, %f9                # encoding: [0x23,0x09]
+#CHECK: lcdr	%f0, %f15               # encoding: [0x23,0x0f]
+#CHECK: lcdr	%f15, %f0               # encoding: [0x23,0xf0]
+#CHECK: lcdr	%f15, %f9               # encoding: [0x23,0xf9]
+
+	lcdr	%f0,%f9
+	lcdr	%f0,%f15
+	lcdr	%f15,%f0
+	lcdr	%f15,%f9
+
 #CHECK: lcebr	%f0, %f9                # encoding: [0xb3,0x03,0x00,0x09]
 #CHECK: lcebr	%f0, %f15               # encoding: [0xb3,0x03,0x00,0x0f]
 #CHECK: lcebr	%f15, %f0               # encoding: [0xb3,0x03,0x00,0xf0]
@@ -7209,6 +7999,16 @@
 	lcebr	%f15,%f0
 	lcebr	%f15,%f9
 
+#CHECK: lcer	%f0, %f9                # encoding: [0x33,0x09]
+#CHECK: lcer	%f0, %f15               # encoding: [0x33,0x0f]
+#CHECK: lcer	%f15, %f0               # encoding: [0x33,0xf0]
+#CHECK: lcer	%f15, %f9               # encoding: [0x33,0xf9]
+
+	lcer	%f0,%f9
+	lcer	%f0,%f15
+	lcer	%f15,%f0
+	lcer	%f15,%f9
+
 #CHECK: lcgfr	%r0, %r0                # encoding: [0xb9,0x13,0x00,0x00]
 #CHECK: lcgfr	%r0, %r15               # encoding: [0xb9,0x13,0x00,0x0f]
 #CHECK: lcgfr	%r15, %r0               # encoding: [0xb9,0x13,0x00,0xf0]
@@ -7249,6 +8049,16 @@
 	lcxbr	%f13,%f0
 	lcxbr	%f13,%f9
 
+#CHECK: lcxr	%f0, %f8                # encoding: [0xb3,0x63,0x00,0x08]
+#CHECK: lcxr	%f0, %f13               # encoding: [0xb3,0x63,0x00,0x0d]
+#CHECK: lcxr	%f13, %f0               # encoding: [0xb3,0x63,0x00,0xd0]
+#CHECK: lcxr	%f13, %f9               # encoding: [0xb3,0x63,0x00,0xd9]
+
+	lcxr	%f0,%f8
+	lcxr	%f0,%f13
+	lcxr	%f13,%f0
+	lcxr	%f13,%f9
+
 #CHECK: ld	%f0, 0                  # encoding: [0x68,0x00,0x00,0x00]
 #CHECK: ld	%f0, 4095               # encoding: [0x68,0x00,0x0f,0xff]
 #CHECK: ld	%f0, 0(%r1)             # encoding: [0x68,0x00,0x10,0x00]
@@ -7265,6 +8075,22 @@
 	ld	%f0, 4095(%r15,%r1)
 	ld	%f15, 0
 
+#CHECK: lde	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x24]
+#CHECK: lde	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x24]
+#CHECK: lde	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x24]
+#CHECK: lde	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x24]
+#CHECK: lde	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x24]
+#CHECK: lde	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x24]
+#CHECK: lde	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x24]
+
+	lde	%f0, 0
+	lde	%f0, 4095
+	lde	%f0, 0(%r1)
+	lde	%f0, 0(%r15)
+	lde	%f0, 4095(%r1,%r15)
+	lde	%f0, 4095(%r15,%r1)
+	lde	%f15, 0
+
 #CHECK: ldeb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x04]
 #CHECK: ldeb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x04]
 #CHECK: ldeb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x04]
@@ -7289,6 +8115,24 @@
 	ldebr	%f7, %f8
 	ldebr	%f15, %f0
 
+#CHECK: lder	%f0, %f15               # encoding: [0xb3,0x24,0x00,0x0f]
+#CHECK: lder	%f7, %f8                # encoding: [0xb3,0x24,0x00,0x78]
+#CHECK: lder	%f15, %f0               # encoding: [0xb3,0x24,0x00,0xf0]
+
+	lder	%f0, %f15
+	lder	%f7, %f8
+	lder	%f15, %f0
+
+#CHECK: ldetr	%f0, %f0, 15            # encoding: [0xb3,0xd4,0x0f,0x00]
+#CHECK: ldetr	%f0, %f15, 0            # encoding: [0xb3,0xd4,0x00,0x0f]
+#CHECK: ldetr	%f7, %f8, 9             # encoding: [0xb3,0xd4,0x09,0x78]
+#CHECK: ldetr	%f15, %f0, 0            # encoding: [0xb3,0xd4,0x00,0xf0]
+
+	ldetr	%f0, %f0, 15
+	ldetr	%f0, %f15, 0
+	ldetr	%f7, %f8, 9
+	ldetr	%f15, %f0, 0
+
 #CHECK: ldgr	%f0, %r0                # encoding: [0xb3,0xc1,0x00,0x00]
 #CHECK: ldgr	%f0, %r15               # encoding: [0xb3,0xc1,0x00,0x0f]
 #CHECK: ldgr	%f15, %r0               # encoding: [0xb3,0xc1,0x00,0xf0]
@@ -7323,6 +8167,32 @@
 	ldxbr	%f13, %f0
 	ldxbr	%f13, %f13
 
+#CHECK: ldxr	%f0, %f0                # encoding: [0x25,0x00]
+#CHECK: ldxr	%f0, %f13               # encoding: [0x25,0x0d]
+#CHECK: ldxr	%f7, %f8                # encoding: [0x25,0x78]
+#CHECK: ldxr	%f15, %f0               # encoding: [0x25,0xf0]
+#CHECK: ldxr	%f15, %f13              # encoding: [0x25,0xfd]
+
+	ldxr	%f0, %f0
+	ldxr	%f0, %f13
+	ldxr	%f7, %f8
+	ldxr	%f15, %f0
+	ldxr	%f15, %f13
+
+#CHECK: ldxtr	%f0, 0, %f0, 0          # encoding: [0xb3,0xdd,0x00,0x00]
+#CHECK: ldxtr	%f0, 0, %f0, 15         # encoding: [0xb3,0xdd,0x0f,0x00]
+#CHECK: ldxtr	%f0, 0, %f13, 0         # encoding: [0xb3,0xdd,0x00,0x0d]
+#CHECK: ldxtr	%f0, 15, %f0, 0         # encoding: [0xb3,0xdd,0xf0,0x00]
+#CHECK: ldxtr	%f4, 5, %f8, 9          # encoding: [0xb3,0xdd,0x59,0x48]
+#CHECK: ldxtr	%f13, 0, %f0, 0         # encoding: [0xb3,0xdd,0x00,0xd0]
+
+	ldxtr	%f0, 0, %f0, 0
+	ldxtr	%f0, 0, %f0, 15
+	ldxtr	%f0, 0, %f13, 0
+	ldxtr	%f0, 15, %f0, 0
+	ldxtr	%f4, 5, %f8, 9
+	ldxtr	%f13, 0, %f0, 0
+
 #CHECK: ldy	%f0, -524288            # encoding: [0xed,0x00,0x00,0x00,0x80,0x65]
 #CHECK: ldy	%f0, -1                 # encoding: [0xed,0x00,0x0f,0xff,0xff,0x65]
 #CHECK: ldy	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x65]
@@ -7373,6 +8243,32 @@
 	ledbr	%f15, %f0
 	ledbr	%f15, %f15
 
+#CHECK: ledr	%f0, %f0                # encoding: [0x35,0x00]
+#CHECK: ledr	%f0, %f15               # encoding: [0x35,0x0f]
+#CHECK: ledr	%f7, %f8                # encoding: [0x35,0x78]
+#CHECK: ledr	%f15, %f0               # encoding: [0x35,0xf0]
+#CHECK: ledr	%f15, %f15              # encoding: [0x35,0xff]
+
+	ledr	%f0, %f0
+	ledr	%f0, %f15
+	ledr	%f7, %f8
+	ledr	%f15, %f0
+	ledr	%f15, %f15
+
+#CHECK: ledtr	%f0, 0, %f0, 0          # encoding: [0xb3,0xd5,0x00,0x00]
+#CHECK: ledtr	%f0, 0, %f0, 15         # encoding: [0xb3,0xd5,0x0f,0x00]
+#CHECK: ledtr	%f0, 0, %f15, 0         # encoding: [0xb3,0xd5,0x00,0x0f]
+#CHECK: ledtr	%f0, 15, %f0, 0         # encoding: [0xb3,0xd5,0xf0,0x00]
+#CHECK: ledtr	%f4, 5, %f6, 7          # encoding: [0xb3,0xd5,0x57,0x46]
+#CHECK: ledtr	%f15, 0, %f0, 0         # encoding: [0xb3,0xd5,0x00,0xf0]
+
+	ledtr	%f0, 0, %f0, 0
+	ledtr	%f0, 0, %f0, 15
+	ledtr	%f0, 0, %f15, 0
+	ledtr	%f0, 15, %f0, 0
+	ledtr	%f4, 5, %f6, 7
+	ledtr	%f15, 0, %f0, 0
+
 #CHECK: ler	%f0, %f9                # encoding: [0x38,0x09]
 #CHECK: ler	%f0, %f15               # encoding: [0x38,0x0f]
 #CHECK: ler	%f15, %f0               # encoding: [0x38,0xf0]
@@ -7395,6 +8291,18 @@
 	lexbr	%f13, %f0
 	lexbr	%f13, %f13
 
+#CHECK: lexr	%f0, %f0                # encoding: [0xb3,0x66,0x00,0x00]
+#CHECK: lexr	%f0, %f13               # encoding: [0xb3,0x66,0x00,0x0d]
+#CHECK: lexr	%f7, %f8                # encoding: [0xb3,0x66,0x00,0x78]
+#CHECK: lexr	%f15, %f0               # encoding: [0xb3,0x66,0x00,0xf0]
+#CHECK: lexr	%f15, %f13              # encoding: [0xb3,0x66,0x00,0xfd]
+
+	lexr	%f0, %f0
+	lexr	%f0, %f13
+	lexr	%f7, %f8
+	lexr	%f15, %f0
+	lexr	%f15, %f13
+
 #CHECK: ley	%f0, -524288            # encoding: [0xed,0x00,0x00,0x00,0x80,0x64]
 #CHECK: ley	%f0, -1                 # encoding: [0xed,0x00,0x0f,0xff,0xff,0x64]
 #CHECK: ley	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x64]
@@ -8303,6 +9211,16 @@
 	lndbr	%f15,%f0
 	lndbr	%f15,%f9
 
+#CHECK: lndr	%f0, %f9                # encoding: [0x21,0x09]
+#CHECK: lndr	%f0, %f15               # encoding: [0x21,0x0f]
+#CHECK: lndr	%f15, %f0               # encoding: [0x21,0xf0]
+#CHECK: lndr	%f15, %f9               # encoding: [0x21,0xf9]
+
+	lndr	%f0,%f9
+	lndr	%f0,%f15
+	lndr	%f15,%f0
+	lndr	%f15,%f9
+
 #CHECK: lnebr	%f0, %f9                # encoding: [0xb3,0x01,0x00,0x09]
 #CHECK: lnebr	%f0, %f15               # encoding: [0xb3,0x01,0x00,0x0f]
 #CHECK: lnebr	%f15, %f0               # encoding: [0xb3,0x01,0x00,0xf0]
@@ -8313,6 +9231,16 @@
 	lnebr	%f15,%f0
 	lnebr	%f15,%f9
 
+#CHECK: lner	%f0, %f9                # encoding: [0x31,0x09]
+#CHECK: lner	%f0, %f15               # encoding: [0x31,0x0f]
+#CHECK: lner	%f15, %f0               # encoding: [0x31,0xf0]
+#CHECK: lner	%f15, %f9               # encoding: [0x31,0xf9]
+
+	lner	%f0,%f9
+	lner	%f0,%f15
+	lner	%f15,%f0
+	lner	%f15,%f9
+
 #CHECK: lngfr	%r0, %r0                # encoding: [0xb9,0x11,0x00,0x00]
 #CHECK: lngfr	%r0, %r15               # encoding: [0xb9,0x11,0x00,0x0f]
 #CHECK: lngfr	%r15, %r0               # encoding: [0xb9,0x11,0x00,0xf0]
@@ -8353,6 +9281,16 @@
 	lnxbr	%f13,%f0
 	lnxbr	%f13,%f9
 
+#CHECK: lnxr	%f0, %f8                # encoding: [0xb3,0x61,0x00,0x08]
+#CHECK: lnxr	%f0, %f13               # encoding: [0xb3,0x61,0x00,0x0d]
+#CHECK: lnxr	%f13, %f0               # encoding: [0xb3,0x61,0x00,0xd0]
+#CHECK: lnxr	%f13, %f9               # encoding: [0xb3,0x61,0x00,0xd9]
+
+	lnxr	%f0,%f8
+	lnxr	%f0,%f13
+	lnxr	%f13,%f0
+	lnxr	%f13,%f9
+
 #CHECK: lpdbr	%f0, %f9                # encoding: [0xb3,0x10,0x00,0x09]
 #CHECK: lpdbr	%f0, %f15               # encoding: [0xb3,0x10,0x00,0x0f]
 #CHECK: lpdbr	%f15, %f0               # encoding: [0xb3,0x10,0x00,0xf0]
@@ -8363,6 +9301,16 @@
 	lpdbr	%f15,%f0
 	lpdbr	%f15,%f9
 
+#CHECK: lpdr	%f0, %f9                # encoding: [0x20,0x09]
+#CHECK: lpdr	%f0, %f15               # encoding: [0x20,0x0f]
+#CHECK: lpdr	%f15, %f0               # encoding: [0x20,0xf0]
+#CHECK: lpdr	%f15, %f9               # encoding: [0x20,0xf9]
+
+	lpdr	%f0,%f9
+	lpdr	%f0,%f15
+	lpdr	%f15,%f0
+	lpdr	%f15,%f9
+
 #CHECK: lpebr	%f0, %f9                # encoding: [0xb3,0x00,0x00,0x09]
 #CHECK: lpebr	%f0, %f15               # encoding: [0xb3,0x00,0x00,0x0f]
 #CHECK: lpebr	%f15, %f0               # encoding: [0xb3,0x00,0x00,0xf0]
@@ -8373,6 +9321,16 @@
 	lpebr	%f15,%f0
 	lpebr	%f15,%f9
 
+#CHECK: lper	%f0, %f9                # encoding: [0x30,0x09]
+#CHECK: lper	%f0, %f15               # encoding: [0x30,0x0f]
+#CHECK: lper	%f15, %f0               # encoding: [0x30,0xf0]
+#CHECK: lper	%f15, %f9               # encoding: [0x30,0xf9]
+
+	lper	%f0,%f9
+	lper	%f0,%f15
+	lper	%f15,%f0
+	lper	%f15,%f9
+
 #CHECK: lpgfr	%r0, %r0                # encoding: [0xb9,0x10,0x00,0x00]
 #CHECK: lpgfr	%r0, %r15               # encoding: [0xb9,0x10,0x00,0x0f]
 #CHECK: lpgfr	%r15, %r0               # encoding: [0xb9,0x10,0x00,0xf0]
@@ -8435,6 +9393,16 @@
 	lpxbr	%f13,%f0
 	lpxbr	%f13,%f9
 
+#CHECK: lpxr	%f0, %f8                # encoding: [0xb3,0x60,0x00,0x08]
+#CHECK: lpxr	%f0, %f13               # encoding: [0xb3,0x60,0x00,0x0d]
+#CHECK: lpxr	%f13, %f0               # encoding: [0xb3,0x60,0x00,0xd0]
+#CHECK: lpxr	%f13, %f9               # encoding: [0xb3,0x60,0x00,0xd9]
+
+	lpxr	%f0,%f8
+	lpxr	%f0,%f13
+	lpxr	%f13,%f0
+	lpxr	%f13,%f9
+
 #CHECK: lr	%r0, %r9                # encoding: [0x18,0x09]
 #CHECK: lr	%r0, %r15               # encoding: [0x18,0x0f]
 #CHECK: lr	%r15, %r0               # encoding: [0x18,0xf0]
@@ -8445,6 +9413,30 @@
 	lr	%r15,%r0
 	lr	%r15,%r9
 
+#CHECK: lrdr	%f0, %f0                # encoding: [0x25,0x00]
+#CHECK: lrdr	%f0, %f13               # encoding: [0x25,0x0d]
+#CHECK: lrdr	%f7, %f8                # encoding: [0x25,0x78]
+#CHECK: lrdr	%f15, %f0               # encoding: [0x25,0xf0]
+#CHECK: lrdr	%f15, %f13              # encoding: [0x25,0xfd]
+
+	lrdr	%f0, %f0
+	lrdr	%f0, %f13
+	lrdr	%f7, %f8
+	lrdr	%f15, %f0
+	lrdr	%f15, %f13
+
+#CHECK: lrer	%f0, %f0                # encoding: [0x35,0x00]
+#CHECK: lrer	%f0, %f15               # encoding: [0x35,0x0f]
+#CHECK: lrer	%f7, %f8                # encoding: [0x35,0x78]
+#CHECK: lrer	%f15, %f0               # encoding: [0x35,0xf0]
+#CHECK: lrer	%f15, %f15              # encoding: [0x35,0xff]
+
+	lrer	%f0, %f0
+	lrer	%f0, %f15
+	lrer	%f7, %f8
+	lrer	%f15, %f0
+	lrer	%f15, %f15
+
 #CHECK: lrl	%r0, .[[LAB:L.*]]-4294967296 # encoding: [0xc4,0x0d,A,A,A,A]
 #CHECK:  fixup A - offset: 2, value: (.[[LAB]]-4294967296)+2, kind: FK_390_PC32DBL
 	lrl	%r0, -0x100000000
@@ -8604,6 +9596,26 @@
 	ltdbr	%f15,%f0
 	ltdbr	%f15,%f9
 
+#CHECK: ltdr	%f0, %f9                # encoding: [0x22,0x09]
+#CHECK: ltdr	%f0, %f15               # encoding: [0x22,0x0f]
+#CHECK: ltdr	%f15, %f0               # encoding: [0x22,0xf0]
+#CHECK: ltdr	%f15, %f9               # encoding: [0x22,0xf9]
+
+	ltdr	%f0,%f9
+	ltdr	%f0,%f15
+	ltdr	%f15,%f0
+	ltdr	%f15,%f9
+
+#CHECK: ltdtr	%f0, %f9                # encoding: [0xb3,0xd6,0x00,0x09]
+#CHECK: ltdtr	%f0, %f15               # encoding: [0xb3,0xd6,0x00,0x0f]
+#CHECK: ltdtr	%f15, %f0               # encoding: [0xb3,0xd6,0x00,0xf0]
+#CHECK: ltdtr	%f15, %f9               # encoding: [0xb3,0xd6,0x00,0xf9]
+
+	ltdtr	%f0,%f9
+	ltdtr	%f0,%f15
+	ltdtr	%f15,%f0
+	ltdtr	%f15,%f9
+
 #CHECK: ltebr	%f0, %f9                # encoding: [0xb3,0x02,0x00,0x09]
 #CHECK: ltebr	%f0, %f15               # encoding: [0xb3,0x02,0x00,0x0f]
 #CHECK: ltebr	%f15, %f0               # encoding: [0xb3,0x02,0x00,0xf0]
@@ -8614,6 +9626,16 @@
 	ltebr	%f15,%f0
 	ltebr	%f15,%f9
 
+#CHECK: lter	%f0, %f9                # encoding: [0x32,0x09]
+#CHECK: lter	%f0, %f15               # encoding: [0x32,0x0f]
+#CHECK: lter	%f15, %f0               # encoding: [0x32,0xf0]
+#CHECK: lter	%f15, %f9               # encoding: [0x32,0xf9]
+
+	lter	%f0,%f9
+	lter	%f0,%f15
+	lter	%f15,%f0
+	lter	%f15,%f9
+
 #CHECK: ltg	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x02]
 #CHECK: ltg	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x02]
 #CHECK: ltg	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x02]
@@ -8698,6 +9720,136 @@
 	ltxbr	%f13,%f0
 	ltxbr	%f13,%f9
 
+#CHECK: ltxr	%f0, %f9                # encoding: [0xb3,0x62,0x00,0x09]
+#CHECK: ltxr	%f0, %f13               # encoding: [0xb3,0x62,0x00,0x0d]
+#CHECK: ltxr	%f13, %f0               # encoding: [0xb3,0x62,0x00,0xd0]
+#CHECK: ltxr	%f13, %f9               # encoding: [0xb3,0x62,0x00,0xd9]
+
+	ltxr	%f0,%f9
+	ltxr	%f0,%f13
+	ltxr	%f13,%f0
+	ltxr	%f13,%f9
+
+#CHECK: ltxtr	%f0, %f9                # encoding: [0xb3,0xde,0x00,0x09]
+#CHECK: ltxtr	%f0, %f13               # encoding: [0xb3,0xde,0x00,0x0d]
+#CHECK: ltxtr	%f13, %f0               # encoding: [0xb3,0xde,0x00,0xd0]
+#CHECK: ltxtr	%f13, %f9               # encoding: [0xb3,0xde,0x00,0xd9]
+
+	ltxtr	%f0,%f9
+	ltxtr	%f0,%f13
+	ltxtr	%f13,%f0
+	ltxtr	%f13,%f9
+
+#CHECK: lxd	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x25]
+#CHECK: lxd	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x25]
+#CHECK: lxd	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x25]
+#CHECK: lxd	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x25]
+#CHECK: lxd	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x25]
+#CHECK: lxd	%f13, 0                 # encoding: [0xed,0xd0,0x00,0x00,0x00,0x25]
+
+	lxd	%f0, 0
+	lxd	%f0, 4095
+	lxd	%f0, 0(%r1)
+	lxd	%f0, 0(%r15)
+	lxd	%f0, 4095(%r1,%r15)
+	lxd	%f0, 4095(%r15,%r1)
+	lxd	%f13, 0
+
+#CHECK: lxdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x05]
+#CHECK: lxdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x05]
+#CHECK: lxdb	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x05]
+#CHECK: lxdb	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x05]
+#CHECK: lxdb	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x05]
+#CHECK: lxdb	%f13, 0                 # encoding: [0xed,0xd0,0x00,0x00,0x00,0x05]
+
+	lxdb	%f0, 0
+	lxdb	%f0, 4095
+	lxdb	%f0, 0(%r1)
+	lxdb	%f0, 0(%r15)
+	lxdb	%f0, 4095(%r1,%r15)
+	lxdb	%f0, 4095(%r15,%r1)
+	lxdb	%f13, 0
+
+#CHECK: lxdbr	%f0, %f8                # encoding: [0xb3,0x05,0x00,0x08]
+#CHECK: lxdbr	%f0, %f13               # encoding: [0xb3,0x05,0x00,0x0d]
+#CHECK: lxdbr	%f13, %f0               # encoding: [0xb3,0x05,0x00,0xd0]
+#CHECK: lxdbr	%f13, %f15              # encoding: [0xb3,0x05,0x00,0xdf]
+
+	lxdbr	%f0,%f8
+	lxdbr	%f0,%f13
+	lxdbr	%f13,%f0
+	lxdbr	%f13,%f15
+
+#CHECK: lxdr	%f0, %f8                # encoding: [0xb3,0x25,0x00,0x08]
+#CHECK: lxdr	%f0, %f13               # encoding: [0xb3,0x25,0x00,0x0d]
+#CHECK: lxdr	%f13, %f0               # encoding: [0xb3,0x25,0x00,0xd0]
+#CHECK: lxdr	%f13, %f15              # encoding: [0xb3,0x25,0x00,0xdf]
+
+	lxdr	%f0,%f8
+	lxdr	%f0,%f13
+	lxdr	%f13,%f0
+	lxdr	%f13,%f15
+
+#CHECK: lxdtr	%f0, %f0, 15            # encoding: [0xb3,0xdc,0x0f,0x00]
+#CHECK: lxdtr	%f0, %f15, 0            # encoding: [0xb3,0xdc,0x00,0x0f]
+#CHECK: lxdtr	%f5, %f8, 9             # encoding: [0xb3,0xdc,0x09,0x58]
+#CHECK: lxdtr	%f13, %f0, 0            # encoding: [0xb3,0xdc,0x00,0xd0]
+
+	lxdtr	%f0, %f0, 15
+	lxdtr	%f0, %f15, 0
+	lxdtr	%f5, %f8, 9
+	lxdtr	%f13, %f0, 0
+
+#CHECK: lxe	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x26]
+#CHECK: lxe	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x26]
+#CHECK: lxe	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x26]
+#CHECK: lxe	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x26]
+#CHECK: lxe	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x26]
+#CHECK: lxe	%f13, 0                 # encoding: [0xed,0xd0,0x00,0x00,0x00,0x26]
+
+	lxe	%f0, 0
+	lxe	%f0, 4095
+	lxe	%f0, 0(%r1)
+	lxe	%f0, 0(%r15)
+	lxe	%f0, 4095(%r1,%r15)
+	lxe	%f0, 4095(%r15,%r1)
+	lxe	%f13, 0
+
+#CHECK: lxeb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x06]
+#CHECK: lxeb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x06]
+#CHECK: lxeb	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x06]
+#CHECK: lxeb	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x06]
+#CHECK: lxeb	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x06]
+#CHECK: lxeb	%f13, 0                 # encoding: [0xed,0xd0,0x00,0x00,0x00,0x06]
+
+	lxeb	%f0, 0
+	lxeb	%f0, 4095
+	lxeb	%f0, 0(%r1)
+	lxeb	%f0, 0(%r15)
+	lxeb	%f0, 4095(%r1,%r15)
+	lxeb	%f0, 4095(%r15,%r1)
+	lxeb	%f13, 0
+
+#CHECK: lxebr	%f0, %f8                # encoding: [0xb3,0x06,0x00,0x08]
+#CHECK: lxebr	%f0, %f13               # encoding: [0xb3,0x06,0x00,0x0d]
+#CHECK: lxebr	%f13, %f0               # encoding: [0xb3,0x06,0x00,0xd0]
+#CHECK: lxebr	%f13, %f15              # encoding: [0xb3,0x06,0x00,0xdf]
+
+	lxebr	%f0,%f8
+	lxebr	%f0,%f13
+	lxebr	%f13,%f0
+	lxebr	%f13,%f15
+
+#CHECK: lxer	%f0, %f8                # encoding: [0xb3,0x26,0x00,0x08]
+#CHECK: lxer	%f0, %f13               # encoding: [0xb3,0x26,0x00,0x0d]
+#CHECK: lxer	%f13, %f0               # encoding: [0xb3,0x26,0x00,0xd0]
+#CHECK: lxer	%f13, %f15              # encoding: [0xb3,0x26,0x00,0xdf]
+
+	lxer	%f0,%f8
+	lxer	%f0,%f13
+	lxer	%f13,%f0
+	lxer	%f13,%f15
+
 #CHECK: lxr	%f0, %f8                # encoding: [0xb3,0x65,0x00,0x08]
 #CHECK: lxr	%f0, %f13               # encoding: [0xb3,0x65,0x00,0x0d]
 #CHECK: lxr	%f13, %f0               # encoding: [0xb3,0x65,0x00,0xd0]
@@ -8770,6 +9922,26 @@
 	m	%r0, 4095(%r15,%r1)
 	m	%r14, 0
 
+#CHECK: mad	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3e]
+#CHECK: mad	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3e]
+#CHECK: mad	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x3e]
+#CHECK: mad	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x3e]
+#CHECK: mad	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x3e]
+#CHECK: mad	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x3e]
+#CHECK: mad	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3e]
+#CHECK: mad	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x3e]
+#CHECK: mad	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x3e]
+
+	mad	%f0, %f0, 0
+	mad	%f0, %f0, 4095
+	mad	%f0, %f0, 0(%r1)
+	mad	%f0, %f0, 0(%r15)
+	mad	%f0, %f0, 4095(%r1,%r15)
+	mad	%f0, %f0, 4095(%r15,%r1)
+	mad	%f0, %f15, 0
+	mad	%f15, %f0, 0
+	mad	%f15, %f15, 0
+
 #CHECK: madb	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x1e]
 #CHECK: madb	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1e]
 #CHECK: madb	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x1e]
@@ -8804,6 +9976,40 @@
 	madbr	%f7, %f8, %f9
 	madbr	%f15, %f15, %f15
 
+#CHECK: madr	%f0, %f0, %f0           # encoding: [0xb3,0x3e,0x00,0x00]
+#CHECK: madr	%f0, %f0, %f15          # encoding: [0xb3,0x3e,0x00,0x0f]
+#CHECK: madr	%f0, %f15, %f0          # encoding: [0xb3,0x3e,0x00,0xf0]
+#CHECK: madr	%f15, %f0, %f0          # encoding: [0xb3,0x3e,0xf0,0x00]
+#CHECK: madr	%f7, %f8, %f9           # encoding: [0xb3,0x3e,0x70,0x89]
+#CHECK: madr	%f15, %f15, %f15        # encoding: [0xb3,0x3e,0xf0,0xff]
+
+	madr	%f0, %f0, %f0
+	madr	%f0, %f0, %f15
+	madr	%f0, %f15, %f0
+	madr	%f15, %f0, %f0
+	madr	%f7, %f8, %f9
+	madr	%f15, %f15, %f15
+
+#CHECK: mae	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x2e]
+#CHECK: mae	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x2e]
+#CHECK: mae	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x2e]
+#CHECK: mae	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x2e]
+#CHECK: mae	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x2e]
+#CHECK: mae	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x2e]
+#CHECK: mae	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x2e]
+#CHECK: mae	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x2e]
+#CHECK: mae	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x2e]
+
+	mae	%f0, %f0, 0
+	mae	%f0, %f0, 4095
+	mae	%f0, %f0, 0(%r1)
+	mae	%f0, %f0, 0(%r15)
+	mae	%f0, %f0, 4095(%r1,%r15)
+	mae	%f0, %f0, 4095(%r15,%r1)
+	mae	%f0, %f15, 0
+	mae	%f15, %f0, 0
+	mae	%f15, %f15, 0
+
 #CHECK: maeb	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x0e]
 #CHECK: maeb	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x0e]
 #CHECK: maeb	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x0e]
@@ -8838,6 +10044,122 @@
 	maebr	%f7, %f8, %f9
 	maebr	%f15, %f15, %f15
 
+#CHECK: maer	%f0, %f0, %f0           # encoding: [0xb3,0x2e,0x00,0x00]
+#CHECK: maer	%f0, %f0, %f15          # encoding: [0xb3,0x2e,0x00,0x0f]
+#CHECK: maer	%f0, %f15, %f0          # encoding: [0xb3,0x2e,0x00,0xf0]
+#CHECK: maer	%f15, %f0, %f0          # encoding: [0xb3,0x2e,0xf0,0x00]
+#CHECK: maer	%f7, %f8, %f9           # encoding: [0xb3,0x2e,0x70,0x89]
+#CHECK: maer	%f15, %f15, %f15        # encoding: [0xb3,0x2e,0xf0,0xff]
+
+	maer	%f0, %f0, %f0
+	maer	%f0, %f0, %f15
+	maer	%f0, %f15, %f0
+	maer	%f15, %f0, %f0
+	maer	%f7, %f8, %f9
+	maer	%f15, %f15, %f15
+
+#CHECK: may	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3a]
+#CHECK: may	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3a]
+#CHECK: may	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x3a]
+#CHECK: may	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x3a]
+#CHECK: may	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x3a]
+#CHECK: may	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x3a]
+#CHECK: may	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3a]
+#CHECK: may	%f13, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xd0,0x3a]
+#CHECK: may	%f13, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xd0,0x3a]
+
+	may	%f0, %f0, 0
+	may	%f0, %f0, 4095
+	may	%f0, %f0, 0(%r1)
+	may	%f0, %f0, 0(%r15)
+	may	%f0, %f0, 4095(%r1,%r15)
+	may	%f0, %f0, 4095(%r15,%r1)
+	may	%f0, %f15, 0
+	may	%f13, %f0, 0
+	may	%f13, %f15, 0
+
+#CHECK: mayh	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3c]
+#CHECK: mayh	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3c]
+#CHECK: mayh	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x3c]
+#CHECK: mayh	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x3c]
+#CHECK: mayh	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x3c]
+#CHECK: mayh	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x3c]
+#CHECK: mayh	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3c]
+#CHECK: mayh	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x3c]
+#CHECK: mayh	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x3c]
+
+	mayh	%f0, %f0, 0
+	mayh	%f0, %f0, 4095
+	mayh	%f0, %f0, 0(%r1)
+	mayh	%f0, %f0, 0(%r15)
+	mayh	%f0, %f0, 4095(%r1,%r15)
+	mayh	%f0, %f0, 4095(%r15,%r1)
+	mayh	%f0, %f15, 0
+	mayh	%f15, %f0, 0
+	mayh	%f15, %f15, 0
+
+#CHECK: mayhr	%f0, %f0, %f0           # encoding: [0xb3,0x3c,0x00,0x00]
+#CHECK: mayhr	%f0, %f0, %f15          # encoding: [0xb3,0x3c,0x00,0x0f]
+#CHECK: mayhr	%f0, %f15, %f0          # encoding: [0xb3,0x3c,0x00,0xf0]
+#CHECK: mayhr	%f15, %f0, %f0          # encoding: [0xb3,0x3c,0xf0,0x00]
+#CHECK: mayhr	%f7, %f8, %f9           # encoding: [0xb3,0x3c,0x70,0x89]
+#CHECK: mayhr	%f15, %f15, %f15        # encoding: [0xb3,0x3c,0xf0,0xff]
+
+	mayhr	%f0, %f0, %f0
+	mayhr	%f0, %f0, %f15
+	mayhr	%f0, %f15, %f0
+	mayhr	%f15, %f0, %f0
+	mayhr	%f7, %f8, %f9
+	mayhr	%f15, %f15, %f15
+
+#CHECK: mayl	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x38]
+#CHECK: mayl	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x38]
+#CHECK: mayl	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x38]
+#CHECK: mayl	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x38]
+#CHECK: mayl	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x38]
+#CHECK: mayl	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x38]
+#CHECK: mayl	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x38]
+#CHECK: mayl	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x38]
+#CHECK: mayl	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x38]
+
+	mayl	%f0, %f0, 0
+	mayl	%f0, %f0, 4095
+	mayl	%f0, %f0, 0(%r1)
+	mayl	%f0, %f0, 0(%r15)
+	mayl	%f0, %f0, 4095(%r1,%r15)
+	mayl	%f0, %f0, 4095(%r15,%r1)
+	mayl	%f0, %f15, 0
+	mayl	%f15, %f0, 0
+	mayl	%f15, %f15, 0
+
+#CHECK: maylr	%f0, %f0, %f0           # encoding: [0xb3,0x38,0x00,0x00]
+#CHECK: maylr	%f0, %f0, %f15          # encoding: [0xb3,0x38,0x00,0x0f]
+#CHECK: maylr	%f0, %f15, %f0          # encoding: [0xb3,0x38,0x00,0xf0]
+#CHECK: maylr	%f15, %f0, %f0          # encoding: [0xb3,0x38,0xf0,0x00]
+#CHECK: maylr	%f7, %f8, %f9           # encoding: [0xb3,0x38,0x70,0x89]
+#CHECK: maylr	%f15, %f15, %f15        # encoding: [0xb3,0x38,0xf0,0xff]
+
+	maylr	%f0, %f0, %f0
+	maylr	%f0, %f0, %f15
+	maylr	%f0, %f15, %f0
+	maylr	%f15, %f0, %f0
+	maylr	%f7, %f8, %f9
+	maylr	%f15, %f15, %f15
+
+#CHECK: mayr	%f0, %f0, %f0           # encoding: [0xb3,0x3a,0x00,0x00]
+#CHECK: mayr	%f0, %f0, %f15          # encoding: [0xb3,0x3a,0x00,0x0f]
+#CHECK: mayr	%f0, %f15, %f0          # encoding: [0xb3,0x3a,0x00,0xf0]
+#CHECK: mayr	%f13, %f0, %f0          # encoding: [0xb3,0x3a,0xd0,0x00]
+#CHECK: mayr	%f5, %f8, %f9           # encoding: [0xb3,0x3a,0x50,0x89]
+#CHECK: mayr	%f13, %f15, %f15        # encoding: [0xb3,0x3a,0xd0,0xff]
+
+	mayr	%f0, %f0, %f0
+	mayr	%f0, %f0, %f15
+	mayr	%f0, %f15, %f0
+	mayr	%f13, %f0, %f0
+	mayr	%f5, %f8, %f9
+	mayr	%f13, %f15, %f15
+
 #CHECK: mc	0, 0                    # encoding: [0xaf,0x00,0x00,0x00]
 #CHECK: mc	4095, 0                 # encoding: [0xaf,0x00,0x0f,0xff]
 #CHECK: mc	0, 255                  # encoding: [0xaf,0xff,0x00,0x00]
@@ -8854,6 +10176,22 @@
 	mc	4095(%r1), 42
 	mc	4095(%r15), 42
 
+#CHECK: md	%f0, 0                  # encoding: [0x6c,0x00,0x00,0x00]
+#CHECK: md	%f0, 4095               # encoding: [0x6c,0x00,0x0f,0xff]
+#CHECK: md	%f0, 0(%r1)             # encoding: [0x6c,0x00,0x10,0x00]
+#CHECK: md	%f0, 0(%r15)            # encoding: [0x6c,0x00,0xf0,0x00]
+#CHECK: md	%f0, 4095(%r1,%r15)     # encoding: [0x6c,0x01,0xff,0xff]
+#CHECK: md	%f0, 4095(%r15,%r1)     # encoding: [0x6c,0x0f,0x1f,0xff]
+#CHECK: md	%f15, 0                 # encoding: [0x6c,0xf0,0x00,0x00]
+
+	md	%f0, 0
+	md	%f0, 4095
+	md	%f0, 0(%r1)
+	md	%f0, 0(%r15)
+	md	%f0, 4095(%r1,%r15)
+	md	%f0, 4095(%r15,%r1)
+	md	%f15, 0
+
 #CHECK: mdb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x1c]
 #CHECK: mdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1c]
 #CHECK: mdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x1c]
@@ -8880,6 +10218,22 @@
 	mdbr	%f7, %f8
 	mdbr	%f15, %f0
 
+#CHECK: mde	%f0, 0                  # encoding: [0x7c,0x00,0x00,0x00]
+#CHECK: mde	%f0, 4095               # encoding: [0x7c,0x00,0x0f,0xff]
+#CHECK: mde	%f0, 0(%r1)             # encoding: [0x7c,0x00,0x10,0x00]
+#CHECK: mde	%f0, 0(%r15)            # encoding: [0x7c,0x00,0xf0,0x00]
+#CHECK: mde	%f0, 4095(%r1,%r15)     # encoding: [0x7c,0x01,0xff,0xff]
+#CHECK: mde	%f0, 4095(%r15,%r1)     # encoding: [0x7c,0x0f,0x1f,0xff]
+#CHECK: mde	%f15, 0                 # encoding: [0x7c,0xf0,0x00,0x00]
+
+	mde	%f0, 0
+	mde	%f0, 4095
+	mde	%f0, 0(%r1)
+	mde	%f0, 0(%r15)
+	mde	%f0, 4095(%r1,%r15)
+	mde	%f0, 4095(%r15,%r1)
+	mde	%f15, 0
+
 #CHECK: mdeb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x0c]
 #CHECK: mdeb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x0c]
 #CHECK: mdeb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x0c]
@@ -8906,6 +10260,70 @@
 	mdebr	%f7, %f8
 	mdebr	%f15, %f0
 
+#CHECK: mder	%f0, %f0                # encoding: [0x3c,0x00]
+#CHECK: mder	%f0, %f15               # encoding: [0x3c,0x0f]
+#CHECK: mder	%f7, %f8                # encoding: [0x3c,0x78]
+#CHECK: mder	%f15, %f0               # encoding: [0x3c,0xf0]
+
+	mder	%f0, %f0
+	mder	%f0, %f15
+	mder	%f7, %f8
+	mder	%f15, %f0
+
+#CHECK: mdr	%f0, %f0                # encoding: [0x2c,0x00]
+#CHECK: mdr	%f0, %f15               # encoding: [0x2c,0x0f]
+#CHECK: mdr	%f7, %f8                # encoding: [0x2c,0x78]
+#CHECK: mdr	%f15, %f0               # encoding: [0x2c,0xf0]
+
+	mdr	%f0, %f0
+	mdr	%f0, %f15
+	mdr	%f7, %f8
+	mdr	%f15, %f0
+
+#CHECK: mdtr	%f0, %f0, %f0           # encoding: [0xb3,0xd0,0x00,0x00]
+#CHECK: mdtr	%f0, %f0, %f15          # encoding: [0xb3,0xd0,0xf0,0x00]
+#CHECK: mdtr	%f0, %f15, %f0          # encoding: [0xb3,0xd0,0x00,0x0f]
+#CHECK: mdtr	%f15, %f0, %f0          # encoding: [0xb3,0xd0,0x00,0xf0]
+#CHECK: mdtr	%f7, %f8, %f9           # encoding: [0xb3,0xd0,0x90,0x78]
+
+	mdtr	%f0, %f0, %f0
+	mdtr	%f0, %f0, %f15
+	mdtr	%f0, %f15, %f0
+	mdtr	%f15, %f0, %f0
+	mdtr	%f7, %f8, %f9
+
+#CHECK: me	%f0, 0                  # encoding: [0x7c,0x00,0x00,0x00]
+#CHECK: me	%f0, 4095               # encoding: [0x7c,0x00,0x0f,0xff]
+#CHECK: me	%f0, 0(%r1)             # encoding: [0x7c,0x00,0x10,0x00]
+#CHECK: me	%f0, 0(%r15)            # encoding: [0x7c,0x00,0xf0,0x00]
+#CHECK: me	%f0, 4095(%r1,%r15)     # encoding: [0x7c,0x01,0xff,0xff]
+#CHECK: me	%f0, 4095(%r15,%r1)     # encoding: [0x7c,0x0f,0x1f,0xff]
+#CHECK: me	%f15, 0                 # encoding: [0x7c,0xf0,0x00,0x00]
+
+	me	%f0, 0
+	me	%f0, 4095
+	me	%f0, 0(%r1)
+	me	%f0, 0(%r15)
+	me	%f0, 4095(%r1,%r15)
+	me	%f0, 4095(%r15,%r1)
+	me	%f15, 0
+
+#CHECK: mee	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x37]
+#CHECK: mee	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x37]
+#CHECK: mee	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x37]
+#CHECK: mee	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x37]
+#CHECK: mee	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x37]
+#CHECK: mee	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x37]
+#CHECK: mee	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x37]
+
+	mee	%f0, 0
+	mee	%f0, 4095
+	mee	%f0, 0(%r1)
+	mee	%f0, 0(%r15)
+	mee	%f0, 4095(%r1,%r15)
+	mee	%f0, 4095(%r15,%r1)
+	mee	%f15, 0
+
 #CHECK: meeb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x17]
 #CHECK: meeb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x17]
 #CHECK: meeb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x17]
@@ -8932,6 +10350,26 @@
 	meebr	%f7, %f8
 	meebr	%f15, %f0
 
+#CHECK: meer	%f0, %f0                # encoding: [0xb3,0x37,0x00,0x00]
+#CHECK: meer	%f0, %f15               # encoding: [0xb3,0x37,0x00,0x0f]
+#CHECK: meer	%f7, %f8                # encoding: [0xb3,0x37,0x00,0x78]
+#CHECK: meer	%f15, %f0               # encoding: [0xb3,0x37,0x00,0xf0]
+
+	meer	%f0, %f0
+	meer	%f0, %f15
+	meer	%f7, %f8
+	meer	%f15, %f0
+
+#CHECK: mer	%f0, %f0                # encoding: [0x3c,0x00]
+#CHECK: mer	%f0, %f15               # encoding: [0x3c,0x0f]
+#CHECK: mer	%f7, %f8                # encoding: [0x3c,0x78]
+#CHECK: mer	%f15, %f0               # encoding: [0x3c,0xf0]
+
+	mer	%f0, %f0
+	mer	%f0, %f15
+	mer	%f7, %f8
+	mer	%f15, %f0
+
 #CHECK: mfy	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x5c]
 #CHECK: mfy	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x5c]
 #CHECK: mfy	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x5c]
@@ -9140,6 +10578,26 @@
 	ms	%r0, 4095(%r15,%r1)
 	ms	%r15, 0
 
+#CHECK: msd	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3f]
+#CHECK: msd	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3f]
+#CHECK: msd	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x3f]
+#CHECK: msd	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x3f]
+#CHECK: msd	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x3f]
+#CHECK: msd	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x3f]
+#CHECK: msd	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3f]
+#CHECK: msd	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x3f]
+#CHECK: msd	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x3f]
+
+	msd	%f0, %f0, 0
+	msd	%f0, %f0, 4095
+	msd	%f0, %f0, 0(%r1)
+	msd	%f0, %f0, 0(%r15)
+	msd	%f0, %f0, 4095(%r1,%r15)
+	msd	%f0, %f0, 4095(%r15,%r1)
+	msd	%f0, %f15, 0
+	msd	%f15, %f0, 0
+	msd	%f15, %f15, 0
+
 #CHECK: msdb	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x1f]
 #CHECK: msdb	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1f]
 #CHECK: msdb	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x1f]
@@ -9174,6 +10632,40 @@
 	msdbr	%f7, %f8, %f9
 	msdbr	%f15, %f15, %f15
 
+#CHECK: msdr	%f0, %f0, %f0           # encoding: [0xb3,0x3f,0x00,0x00]
+#CHECK: msdr	%f0, %f0, %f15          # encoding: [0xb3,0x3f,0x00,0x0f]
+#CHECK: msdr	%f0, %f15, %f0          # encoding: [0xb3,0x3f,0x00,0xf0]
+#CHECK: msdr	%f15, %f0, %f0          # encoding: [0xb3,0x3f,0xf0,0x00]
+#CHECK: msdr	%f7, %f8, %f9           # encoding: [0xb3,0x3f,0x70,0x89]
+#CHECK: msdr	%f15, %f15, %f15        # encoding: [0xb3,0x3f,0xf0,0xff]
+
+	msdr	%f0, %f0, %f0
+	msdr	%f0, %f0, %f15
+	msdr	%f0, %f15, %f0
+	msdr	%f15, %f0, %f0
+	msdr	%f7, %f8, %f9
+	msdr	%f15, %f15, %f15
+
+#CHECK: mse	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x2f]
+#CHECK: mse	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x2f]
+#CHECK: mse	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x2f]
+#CHECK: mse	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x2f]
+#CHECK: mse	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x2f]
+#CHECK: mse	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x2f]
+#CHECK: mse	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x2f]
+#CHECK: mse	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x2f]
+#CHECK: mse	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x2f]
+
+	mse	%f0, %f0, 0
+	mse	%f0, %f0, 4095
+	mse	%f0, %f0, 0(%r1)
+	mse	%f0, %f0, 0(%r15)
+	mse	%f0, %f0, 4095(%r1,%r15)
+	mse	%f0, %f0, 4095(%r15,%r1)
+	mse	%f0, %f15, 0
+	mse	%f15, %f0, 0
+	mse	%f15, %f15, 0
+
 #CHECK: mseb	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x0f]
 #CHECK: mseb	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x0f]
 #CHECK: mseb	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x0f]
@@ -9208,6 +10700,20 @@
 	msebr	%f7, %f8, %f9
 	msebr	%f15, %f15, %f15
 
+#CHECK: mser	%f0, %f0, %f0           # encoding: [0xb3,0x2f,0x00,0x00]
+#CHECK: mser	%f0, %f0, %f15          # encoding: [0xb3,0x2f,0x00,0x0f]
+#CHECK: mser	%f0, %f15, %f0          # encoding: [0xb3,0x2f,0x00,0xf0]
+#CHECK: mser	%f15, %f0, %f0          # encoding: [0xb3,0x2f,0xf0,0x00]
+#CHECK: mser	%f7, %f8, %f9           # encoding: [0xb3,0x2f,0x70,0x89]
+#CHECK: mser	%f15, %f15, %f15        # encoding: [0xb3,0x2f,0xf0,0xff]
+
+	mser	%f0, %f0, %f0
+	mser	%f0, %f0, %f15
+	mser	%f0, %f15, %f0
+	mser	%f15, %f0, %f0
+	mser	%f7, %f8, %f9
+	mser	%f15, %f15, %f15
+
 #CHECK: msfi	%r0, -2147483648        # encoding: [0xc2,0x01,0x80,0x00,0x00,0x00]
 #CHECK: msfi	%r0, -1                 # encoding: [0xc2,0x01,0xff,0xff,0xff,0xff]
 #CHECK: msfi	%r0, 0                  # encoding: [0xc2,0x01,0x00,0x00,0x00,0x00]
@@ -9672,6 +11178,22 @@
 	mxbr	%f8, %f5
 	mxbr	%f13, %f13
 
+#CHECK: mxd	%f0, 0                  # encoding: [0x67,0x00,0x00,0x00]
+#CHECK: mxd	%f0, 4095               # encoding: [0x67,0x00,0x0f,0xff]
+#CHECK: mxd	%f0, 0(%r1)             # encoding: [0x67,0x00,0x10,0x00]
+#CHECK: mxd	%f0, 0(%r15)            # encoding: [0x67,0x00,0xf0,0x00]
+#CHECK: mxd	%f0, 4095(%r1,%r15)     # encoding: [0x67,0x01,0xff,0xff]
+#CHECK: mxd	%f0, 4095(%r15,%r1)     # encoding: [0x67,0x0f,0x1f,0xff]
+#CHECK: mxd	%f13, 0                 # encoding: [0x67,0xd0,0x00,0x00]
+
+	mxd	%f0, 0
+	mxd	%f0, 4095
+	mxd	%f0, 0(%r1)
+	mxd	%f0, 0(%r15)
+	mxd	%f0, 4095(%r1,%r15)
+	mxd	%f0, 4095(%r15,%r1)
+	mxd	%f13, 0
+
 #CHECK: mxdb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x07]
 #CHECK: mxdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x07]
 #CHECK: mxdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x07]
@@ -9698,6 +11220,140 @@
 	mxdbr	%f8, %f8
 	mxdbr	%f13, %f0
 
+#CHECK: mxdr	%f0, %f0                # encoding: [0x27,0x00]
+#CHECK: mxdr	%f0, %f15               # encoding: [0x27,0x0f]
+#CHECK: mxdr	%f8, %f8                # encoding: [0x27,0x88]
+#CHECK: mxdr	%f13, %f0               # encoding: [0x27,0xd0]
+
+	mxdr	%f0, %f0
+	mxdr	%f0, %f15
+	mxdr	%f8, %f8
+	mxdr	%f13, %f0
+
+#CHECK: mxr	%f0, %f0                # encoding: [0x26,0x00]
+#CHECK: mxr	%f0, %f13               # encoding: [0x26,0x0d]
+#CHECK: mxr	%f8, %f5                # encoding: [0x26,0x85]
+#CHECK: mxr	%f13, %f13              # encoding: [0x26,0xdd]
+
+	mxr	%f0, %f0
+	mxr	%f0, %f13
+	mxr	%f8, %f5
+	mxr	%f13, %f13
+
+#CHECK: mxtr	%f0, %f0, %f0           # encoding: [0xb3,0xd8,0x00,0x00]
+#CHECK: mxtr	%f0, %f0, %f13          # encoding: [0xb3,0xd8,0xd0,0x00]
+#CHECK: mxtr	%f0, %f13, %f0          # encoding: [0xb3,0xd8,0x00,0x0d]
+#CHECK: mxtr	%f13, %f0, %f0          # encoding: [0xb3,0xd8,0x00,0xd0]
+#CHECK: mxtr	%f8, %f8, %f8           # encoding: [0xb3,0xd8,0x80,0x88]
+
+	mxtr	%f0, %f0, %f0
+	mxtr	%f0, %f0, %f13
+	mxtr	%f0, %f13, %f0
+	mxtr	%f13, %f0, %f0
+	mxtr	%f8, %f8, %f8
+
+#CHECK: my	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3b]
+#CHECK: my	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3b]
+#CHECK: my	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x3b]
+#CHECK: my	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x3b]
+#CHECK: my	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x3b]
+#CHECK: my	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x3b]
+#CHECK: my	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3b]
+#CHECK: my	%f13, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xd0,0x3b]
+#CHECK: my	%f13, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xd0,0x3b]
+
+	my	%f0, %f0, 0
+	my	%f0, %f0, 4095
+	my	%f0, %f0, 0(%r1)
+	my	%f0, %f0, 0(%r15)
+	my	%f0, %f0, 4095(%r1,%r15)
+	my	%f0, %f0, 4095(%r15,%r1)
+	my	%f0, %f15, 0
+	my	%f13, %f0, 0
+	my	%f13, %f15, 0
+
+#CHECK: myh	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x3d]
+#CHECK: myh	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x3d]
+#CHECK: myh	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x3d]
+#CHECK: myh	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x3d]
+#CHECK: myh	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x3d]
+#CHECK: myh	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x3d]
+#CHECK: myh	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x3d]
+#CHECK: myh	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x3d]
+#CHECK: myh	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x3d]
+
+	myh	%f0, %f0, 0
+	myh	%f0, %f0, 4095
+	myh	%f0, %f0, 0(%r1)
+	myh	%f0, %f0, 0(%r15)
+	myh	%f0, %f0, 4095(%r1,%r15)
+	myh	%f0, %f0, 4095(%r15,%r1)
+	myh	%f0, %f15, 0
+	myh	%f15, %f0, 0
+	myh	%f15, %f15, 0
+
+#CHECK: myhr	%f0, %f0, %f0           # encoding: [0xb3,0x3d,0x00,0x00]
+#CHECK: myhr	%f0, %f0, %f15          # encoding: [0xb3,0x3d,0x00,0x0f]
+#CHECK: myhr	%f0, %f15, %f0          # encoding: [0xb3,0x3d,0x00,0xf0]
+#CHECK: myhr	%f15, %f0, %f0          # encoding: [0xb3,0x3d,0xf0,0x00]
+#CHECK: myhr	%f7, %f8, %f9           # encoding: [0xb3,0x3d,0x70,0x89]
+#CHECK: myhr	%f15, %f15, %f15        # encoding: [0xb3,0x3d,0xf0,0xff]
+
+	myhr	%f0, %f0, %f0
+	myhr	%f0, %f0, %f15
+	myhr	%f0, %f15, %f0
+	myhr	%f15, %f0, %f0
+	myhr	%f7, %f8, %f9
+	myhr	%f15, %f15, %f15
+
+#CHECK: myl	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x39]
+#CHECK: myl	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x39]
+#CHECK: myl	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x39]
+#CHECK: myl	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x39]
+#CHECK: myl	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x39]
+#CHECK: myl	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x39]
+#CHECK: myl	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x39]
+#CHECK: myl	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x39]
+#CHECK: myl	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x39]
+
+	myl	%f0, %f0, 0
+	myl	%f0, %f0, 4095
+	myl	%f0, %f0, 0(%r1)
+	myl	%f0, %f0, 0(%r15)
+	myl	%f0, %f0, 4095(%r1,%r15)
+	myl	%f0, %f0, 4095(%r15,%r1)
+	myl	%f0, %f15, 0
+	myl	%f15, %f0, 0
+	myl	%f15, %f15, 0
+
+#CHECK: mylr	%f0, %f0, %f0           # encoding: [0xb3,0x39,0x00,0x00]
+#CHECK: mylr	%f0, %f0, %f15          # encoding: [0xb3,0x39,0x00,0x0f]
+#CHECK: mylr	%f0, %f15, %f0          # encoding: [0xb3,0x39,0x00,0xf0]
+#CHECK: mylr	%f15, %f0, %f0          # encoding: [0xb3,0x39,0xf0,0x00]
+#CHECK: mylr	%f7, %f8, %f9           # encoding: [0xb3,0x39,0x70,0x89]
+#CHECK: mylr	%f15, %f15, %f15        # encoding: [0xb3,0x39,0xf0,0xff]
+
+	mylr	%f0, %f0, %f0
+	mylr	%f0, %f0, %f15
+	mylr	%f0, %f15, %f0
+	mylr	%f15, %f0, %f0
+	mylr	%f7, %f8, %f9
+	mylr	%f15, %f15, %f15
+
+#CHECK: myr	%f0, %f0, %f0           # encoding: [0xb3,0x3b,0x00,0x00]
+#CHECK: myr	%f0, %f0, %f15          # encoding: [0xb3,0x3b,0x00,0x0f]
+#CHECK: myr	%f0, %f15, %f0          # encoding: [0xb3,0x3b,0x00,0xf0]
+#CHECK: myr	%f13, %f0, %f0          # encoding: [0xb3,0x3b,0xd0,0x00]
+#CHECK: myr	%f5, %f8, %f9           # encoding: [0xb3,0x3b,0x50,0x89]
+#CHECK: myr	%f13, %f15, %f15        # encoding: [0xb3,0x3b,0xd0,0xff]
+
+	myr	%f0, %f0, %f0
+	myr	%f0, %f0, %f15
+	myr	%f0, %f15, %f0
+	myr	%f13, %f0, %f0
+	myr	%f5, %f8, %f9
+	myr	%f13, %f15, %f15
+
 #CHECK: n	%r0, 0                  # encoding: [0x54,0x00,0x00,0x00]
 #CHECK: n	%r0, 4095               # encoding: [0x54,0x00,0x0f,0xff]
 #CHECK: n	%r0, 0(%r1)             # encoding: [0x54,0x00,0x10,0x00]
@@ -10193,6 +11849,9 @@
 	pfdrl	7, frob@PLT
 	pfdrl	8, frob@PLT
 
+#CHECK: pfpo                            # encoding: [0x01,0x0a]
+        pfpo
+
 #CHECK: pka	0, 0(1)                 # encoding: [0xe9,0x00,0x00,0x00,0x00,0x00]
 #CHECK: pka	0, 0(1,%r1)             # encoding: [0xe9,0x00,0x00,0x00,0x10,0x00]
 #CHECK: pka	0, 0(1,%r15)            # encoding: [0xe9,0x00,0x00,0x00,0xf0,0x00]
@@ -10262,6 +11921,34 @@
 #CHECK: pr                              # encoding: [0x01,0x01]
         pr
 
+#CHECK: qadtr	%f0, %f0, %f0, 0        # encoding: [0xb3,0xf5,0x00,0x00]
+#CHECK: qadtr	%f0, %f0, %f0, 15       # encoding: [0xb3,0xf5,0x0f,0x00]
+#CHECK: qadtr	%f0, %f0, %f15, 0       # encoding: [0xb3,0xf5,0x00,0x0f]
+#CHECK: qadtr	%f0, %f15, %f0, 0       # encoding: [0xb3,0xf5,0xf0,0x00]
+#CHECK: qadtr	%f4, %f5, %f6, 7        # encoding: [0xb3,0xf5,0x57,0x46]
+#CHECK: qadtr	%f15, %f0, %f0, 0       # encoding: [0xb3,0xf5,0x00,0xf0]
+
+	qadtr	%f0, %f0, %f0, 0
+	qadtr	%f0, %f0, %f0, 15
+	qadtr	%f0, %f0, %f15, 0
+	qadtr	%f0, %f15, %f0, 0
+	qadtr	%f4, %f5, %f6, 7
+	qadtr	%f15, %f0, %f0, 0
+
+#CHECK: qaxtr	%f0, %f0, %f0, 0        # encoding: [0xb3,0xfd,0x00,0x00]
+#CHECK: qaxtr	%f0, %f0, %f0, 15       # encoding: [0xb3,0xfd,0x0f,0x00]
+#CHECK: qaxtr	%f0, %f0, %f13, 0       # encoding: [0xb3,0xfd,0x00,0x0d]
+#CHECK: qaxtr	%f0, %f13, %f0, 0       # encoding: [0xb3,0xfd,0xd0,0x00]
+#CHECK: qaxtr	%f8, %f8, %f8, 8        # encoding: [0xb3,0xfd,0x88,0x88]
+#CHECK: qaxtr	%f13, %f0, %f0, 0       # encoding: [0xb3,0xfd,0x00,0xd0]
+
+	qaxtr	%f0, %f0, %f0, 0
+	qaxtr	%f0, %f0, %f0, 15
+	qaxtr	%f0, %f0, %f13, 0
+	qaxtr	%f0, %f13, %f0, 0
+	qaxtr	%f8, %f8, %f8, 8
+	qaxtr	%f13, %f0, %f0, 0
+
 #CHECK: risbg	%r0, %r0, 0, 0, 0       # encoding: [0xec,0x00,0x00,0x00,0x00,0x55]
 #CHECK: risbg	%r0, %r0, 0, 0, 63      # encoding: [0xec,0x00,0x00,0x00,0x3f,0x55]
 #CHECK: risbg	%r0, %r0, 0, 255, 0     # encoding: [0xec,0x00,0x00,0xff,0x00,0x55]
@@ -10362,6 +12049,34 @@
 	rosbg	%r15,%r0,0,0,0
 	rosbg	%r4,%r5,6,7,8
 
+#CHECK: rrdtr	%f0, %f0, %f0, 0        # encoding: [0xb3,0xf7,0x00,0x00]
+#CHECK: rrdtr	%f0, %f0, %f0, 15       # encoding: [0xb3,0xf7,0x0f,0x00]
+#CHECK: rrdtr	%f0, %f0, %f15, 0       # encoding: [0xb3,0xf7,0x00,0x0f]
+#CHECK: rrdtr	%f0, %f15, %f0, 0       # encoding: [0xb3,0xf7,0xf0,0x00]
+#CHECK: rrdtr	%f4, %f5, %f6, 7        # encoding: [0xb3,0xf7,0x57,0x46]
+#CHECK: rrdtr	%f15, %f0, %f0, 0       # encoding: [0xb3,0xf7,0x00,0xf0]
+
+	rrdtr	%f0, %f0, %f0, 0
+	rrdtr	%f0, %f0, %f0, 15
+	rrdtr	%f0, %f0, %f15, 0
+	rrdtr	%f0, %f15, %f0, 0
+	rrdtr	%f4, %f5, %f6, 7
+	rrdtr	%f15, %f0, %f0, 0
+
+#CHECK: rrxtr	%f0, %f0, %f0, 0        # encoding: [0xb3,0xff,0x00,0x00]
+#CHECK: rrxtr	%f0, %f0, %f0, 15       # encoding: [0xb3,0xff,0x0f,0x00]
+#CHECK: rrxtr	%f0, %f0, %f13, 0       # encoding: [0xb3,0xff,0x00,0x0d]
+#CHECK: rrxtr	%f0, %f13, %f0, 0       # encoding: [0xb3,0xff,0xd0,0x00]
+#CHECK: rrxtr	%f8, %f8, %f8, 8        # encoding: [0xb3,0xff,0x88,0x88]
+#CHECK: rrxtr	%f13, %f0, %f0, 0       # encoding: [0xb3,0xff,0x00,0xd0]
+
+	rrxtr	%f0, %f0, %f0, 0
+	rrxtr	%f0, %f0, %f0, 15
+	rrxtr	%f0, %f0, %f13, 0
+	rrxtr	%f0, %f13, %f0, 0
+	rrxtr	%f8, %f8, %f8, 8
+	rrxtr	%f13, %f0, %f0, 0
+
 #CHECK: rxsbg	%r0, %r0, 0, 0, 0       # encoding: [0xec,0x00,0x00,0x00,0x00,0x57]
 #CHECK: rxsbg	%r0, %r0, 0, 0, 63      # encoding: [0xec,0x00,0x00,0x00,0x3f,0x57]
 #CHECK: rxsbg	%r0, %r0, 0, 255, 0     # encoding: [0xec,0x00,0x00,0xff,0x00,0x57]
@@ -10414,6 +12129,22 @@
 	sar	%a7, %r8
 	sar	%a15, %r15
 
+#CHECK: sd	%f0, 0                  # encoding: [0x6b,0x00,0x00,0x00]
+#CHECK: sd	%f0, 4095               # encoding: [0x6b,0x00,0x0f,0xff]
+#CHECK: sd	%f0, 0(%r1)             # encoding: [0x6b,0x00,0x10,0x00]
+#CHECK: sd	%f0, 0(%r15)            # encoding: [0x6b,0x00,0xf0,0x00]
+#CHECK: sd	%f0, 4095(%r1,%r15)     # encoding: [0x6b,0x01,0xff,0xff]
+#CHECK: sd	%f0, 4095(%r15,%r1)     # encoding: [0x6b,0x0f,0x1f,0xff]
+#CHECK: sd	%f15, 0                 # encoding: [0x6b,0xf0,0x00,0x00]
+
+	sd	%f0, 0
+	sd	%f0, 4095
+	sd	%f0, 0(%r1)
+	sd	%f0, 0(%r15)
+	sd	%f0, 4095(%r1,%r15)
+	sd	%f0, 4095(%r15,%r1)
+	sd	%f15, 0
+
 #CHECK: sdb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x1b]
 #CHECK: sdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x1b]
 #CHECK: sdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x1b]
@@ -10440,6 +12171,44 @@
 	sdbr	%f7, %f8
 	sdbr	%f15, %f0
 
+#CHECK: sdr	%f0, %f0                # encoding: [0x2b,0x00]
+#CHECK: sdr	%f0, %f15               # encoding: [0x2b,0x0f]
+#CHECK: sdr	%f7, %f8                # encoding: [0x2b,0x78]
+#CHECK: sdr	%f15, %f0               # encoding: [0x2b,0xf0]
+
+	sdr	%f0, %f0
+	sdr	%f0, %f15
+	sdr	%f7, %f8
+	sdr	%f15, %f0
+
+#CHECK: sdtr	%f0, %f0, %f0           # encoding: [0xb3,0xd3,0x00,0x00]
+#CHECK: sdtr	%f0, %f0, %f15          # encoding: [0xb3,0xd3,0xf0,0x00]
+#CHECK: sdtr	%f0, %f15, %f0          # encoding: [0xb3,0xd3,0x00,0x0f]
+#CHECK: sdtr	%f15, %f0, %f0          # encoding: [0xb3,0xd3,0x00,0xf0]
+#CHECK: sdtr	%f7, %f8, %f9           # encoding: [0xb3,0xd3,0x90,0x78]
+
+	sdtr	%f0, %f0, %f0
+	sdtr	%f0, %f0, %f15
+	sdtr	%f0, %f15, %f0
+	sdtr	%f15, %f0, %f0
+	sdtr	%f7, %f8, %f9
+
+#CHECK: se	%f0, 0                  # encoding: [0x7b,0x00,0x00,0x00]
+#CHECK: se	%f0, 4095               # encoding: [0x7b,0x00,0x0f,0xff]
+#CHECK: se	%f0, 0(%r1)             # encoding: [0x7b,0x00,0x10,0x00]
+#CHECK: se	%f0, 0(%r15)            # encoding: [0x7b,0x00,0xf0,0x00]
+#CHECK: se	%f0, 4095(%r1,%r15)     # encoding: [0x7b,0x01,0xff,0xff]
+#CHECK: se	%f0, 4095(%r15,%r1)     # encoding: [0x7b,0x0f,0x1f,0xff]
+#CHECK: se	%f15, 0                 # encoding: [0x7b,0xf0,0x00,0x00]
+
+	se	%f0, 0
+	se	%f0, 4095
+	se	%f0, 0(%r1)
+	se	%f0, 0(%r15)
+	se	%f0, 4095(%r1,%r15)
+	se	%f0, 4095(%r15,%r1)
+	se	%f15, 0
+
 #CHECK: seb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x0b]
 #CHECK: seb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x0b]
 #CHECK: seb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x0b]
@@ -10466,6 +12235,16 @@
 	sebr	%f7, %f8
 	sebr	%f15, %f0
 
+#CHECK: ser	%f0, %f0                # encoding: [0x3b,0x00]
+#CHECK: ser	%f0, %f15               # encoding: [0x3b,0x0f]
+#CHECK: ser	%f7, %f8                # encoding: [0x3b,0x78]
+#CHECK: ser	%f15, %f0               # encoding: [0x3b,0xf0]
+
+	ser	%f0, %f0
+	ser	%f0, %f15
+	ser	%f7, %f8
+	ser	%f15, %f0
+
 #CHECK: sfasr	%r0                     # encoding: [0xb3,0x85,0x00,0x00]
 #CHECK: sfasr	%r1                     # encoding: [0xb3,0x85,0x00,0x10]
 #CHECK: sfasr	%r15                    # encoding: [0xb3,0x85,0x00,0xf0]
@@ -10744,6 +12523,26 @@
 	sldl	%r0,4095(%r1)
 	sldl	%r0,4095(%r15)
 
+#CHECK: sldt	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x40]
+#CHECK: sldt	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x40]
+#CHECK: sldt	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x40]
+#CHECK: sldt	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x40]
+#CHECK: sldt	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x40]
+#CHECK: sldt	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x40]
+#CHECK: sldt	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x40]
+#CHECK: sldt	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x40]
+#CHECK: sldt	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x40]
+
+	sldt	%f0, %f0, 0
+	sldt	%f0, %f0, 4095
+	sldt	%f0, %f0, 0(%r1)
+	sldt	%f0, %f0, 0(%r15)
+	sldt	%f0, %f0, 4095(%r1,%r15)
+	sldt	%f0, %f0, 4095(%r15,%r1)
+	sldt	%f0, %f15, 0
+	sldt	%f15, %f0, 0
+	sldt	%f15, %f15, 0
+
 #CHECK: slfi	%r0, 0                  # encoding: [0xc2,0x05,0x00,0x00,0x00,0x00]
 #CHECK: slfi	%r0, 4294967295         # encoding: [0xc2,0x05,0xff,0xff,0xff,0xff]
 #CHECK: slfi	%r15, 0                 # encoding: [0xc2,0xf5,0x00,0x00,0x00,0x00]
@@ -10878,6 +12677,26 @@
 	slr	%r15,%r0
 	slr	%r7,%r8
 
+#CHECK: slxt	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x48]
+#CHECK: slxt	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x48]
+#CHECK: slxt	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x48]
+#CHECK: slxt	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x48]
+#CHECK: slxt	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x48]
+#CHECK: slxt	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x48]
+#CHECK: slxt	%f0, %f13, 0            # encoding: [0xed,0xd0,0x00,0x00,0x00,0x48]
+#CHECK: slxt	%f13, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xd0,0x48]
+#CHECK: slxt	%f13, %f13, 0           # encoding: [0xed,0xd0,0x00,0x00,0xd0,0x48]
+
+	slxt	%f0, %f0, 0
+	slxt	%f0, %f0, 4095
+	slxt	%f0, %f0, 0(%r1)
+	slxt	%f0, %f0, 0(%r15)
+	slxt	%f0, %f0, 4095(%r1,%r15)
+	slxt	%f0, %f0, 4095(%r15,%r1)
+	slxt	%f0, %f13, 0
+	slxt	%f13, %f0, 0
+	slxt	%f13, %f13, 0
+
 #CHECK: sly	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x5f]
 #CHECK: sly	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x5f]
 #CHECK: sly	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x5f]
@@ -10938,6 +12757,22 @@
 	spm	%r1
 	spm	%r15
 
+#CHECK: sqd	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x35]
+#CHECK: sqd	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x35]
+#CHECK: sqd	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x35]
+#CHECK: sqd	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x35]
+#CHECK: sqd	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x35]
+#CHECK: sqd	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x35]
+#CHECK: sqd	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x35]
+
+	sqd	%f0, 0
+	sqd	%f0, 4095
+	sqd	%f0, 0(%r1)
+	sqd	%f0, 0(%r15)
+	sqd	%f0, 4095(%r1,%r15)
+	sqd	%f0, 4095(%r15,%r1)
+	sqd	%f15, 0
+
 #CHECK: sqdb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x15]
 #CHECK: sqdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x15]
 #CHECK: sqdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x15]
@@ -10964,6 +12799,32 @@
 	sqdbr	%f7, %f8
 	sqdbr	%f15, %f0
 
+#CHECK: sqdr	%f0, %f0                # encoding: [0xb2,0x44,0x00,0x00]
+#CHECK: sqdr	%f0, %f15               # encoding: [0xb2,0x44,0x00,0x0f]
+#CHECK: sqdr	%f7, %f8                # encoding: [0xb2,0x44,0x00,0x78]
+#CHECK: sqdr	%f15, %f0               # encoding: [0xb2,0x44,0x00,0xf0]
+
+	sqdr	%f0, %f0
+	sqdr	%f0, %f15
+	sqdr	%f7, %f8
+	sqdr	%f15, %f0
+
+#CHECK: sqe	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x34]
+#CHECK: sqe	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x34]
+#CHECK: sqe	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x34]
+#CHECK: sqe	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x34]
+#CHECK: sqe	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x34]
+#CHECK: sqe	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x34]
+#CHECK: sqe	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x34]
+
+	sqe	%f0, 0
+	sqe	%f0, 4095
+	sqe	%f0, 0(%r1)
+	sqe	%f0, 0(%r15)
+	sqe	%f0, 4095(%r1,%r15)
+	sqe	%f0, 4095(%r15,%r1)
+	sqe	%f15, 0
+
 #CHECK: sqeb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x14]
 #CHECK: sqeb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x14]
 #CHECK: sqeb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x14]
@@ -10990,6 +12851,16 @@
 	sqebr	%f7, %f8
 	sqebr	%f15, %f0
 
+#CHECK: sqer	%f0, %f0                # encoding: [0xb2,0x45,0x00,0x00]
+#CHECK: sqer	%f0, %f15               # encoding: [0xb2,0x45,0x00,0x0f]
+#CHECK: sqer	%f7, %f8                # encoding: [0xb2,0x45,0x00,0x78]
+#CHECK: sqer	%f15, %f0               # encoding: [0xb2,0x45,0x00,0xf0]
+
+	sqer	%f0, %f0
+	sqer	%f0, %f15
+	sqer	%f7, %f8
+	sqer	%f15, %f0
+
 #CHECK: sqxbr	%f0, %f0                # encoding: [0xb3,0x16,0x00,0x00]
 #CHECK: sqxbr	%f0, %f13               # encoding: [0xb3,0x16,0x00,0x0d]
 #CHECK: sqxbr	%f8, %f8                # encoding: [0xb3,0x16,0x00,0x88]
@@ -11000,6 +12871,16 @@
 	sqxbr	%f8, %f8
 	sqxbr	%f13, %f0
 
+#CHECK: sqxr	%f0, %f0                # encoding: [0xb3,0x36,0x00,0x00]
+#CHECK: sqxr	%f0, %f13               # encoding: [0xb3,0x36,0x00,0x0d]
+#CHECK: sqxr	%f8, %f8                # encoding: [0xb3,0x36,0x00,0x88]
+#CHECK: sqxr	%f13, %f0               # encoding: [0xb3,0x36,0x00,0xd0]
+
+	sqxr	%f0, %f0
+	sqxr	%f0, %f13
+	sqxr	%f8, %f8
+	sqxr	%f13, %f0
+
 #CHECK: sr	%r0, %r0                # encoding: [0x1b,0x00]
 #CHECK: sr	%r0, %r15               # encoding: [0x1b,0x0f]
 #CHECK: sr	%r15, %r0               # encoding: [0x1b,0xf0]
@@ -11090,6 +12971,26 @@
 	srdl	%r0,4095(%r1)
 	srdl	%r0,4095(%r15)
 
+#CHECK: srdt	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x41]
+#CHECK: srdt	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x41]
+#CHECK: srdt	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x41]
+#CHECK: srdt	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x41]
+#CHECK: srdt	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x41]
+#CHECK: srdt	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x41]
+#CHECK: srdt	%f0, %f15, 0            # encoding: [0xed,0xf0,0x00,0x00,0x00,0x41]
+#CHECK: srdt	%f15, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xf0,0x41]
+#CHECK: srdt	%f15, %f15, 0           # encoding: [0xed,0xf0,0x00,0x00,0xf0,0x41]
+
+	srdt	%f0, %f0, 0
+	srdt	%f0, %f0, 4095
+	srdt	%f0, %f0, 0(%r1)
+	srdt	%f0, %f0, 0(%r15)
+	srdt	%f0, %f0, 4095(%r1,%r15)
+	srdt	%f0, %f0, 4095(%r15,%r1)
+	srdt	%f0, %f15, 0
+	srdt	%f15, %f0, 0
+	srdt	%f15, %f15, 0
+
 #CHECK: srl	%r0, 0                  # encoding: [0x88,0x00,0x00,0x00]
 #CHECK: srl	%r7, 0                  # encoding: [0x88,0x70,0x00,0x00]
 #CHECK: srl	%r15, 0                 # encoding: [0x88,0xf0,0x00,0x00]
@@ -11210,6 +13111,26 @@
 	srstu	%r15,%r0
 	srstu	%r7,%r8
 
+#CHECK: srxt	%f0, %f0, 0             # encoding: [0xed,0x00,0x00,0x00,0x00,0x49]
+#CHECK: srxt	%f0, %f0, 4095          # encoding: [0xed,0x00,0x0f,0xff,0x00,0x49]
+#CHECK: srxt	%f0, %f0, 0(%r1)        # encoding: [0xed,0x00,0x10,0x00,0x00,0x49]
+#CHECK: srxt	%f0, %f0, 0(%r15)       # encoding: [0xed,0x00,0xf0,0x00,0x00,0x49]
+#CHECK: srxt	%f0, %f0, 4095(%r1,%r15) # encoding: [0xed,0x01,0xff,0xff,0x00,0x49]
+#CHECK: srxt	%f0, %f0, 4095(%r15,%r1) # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x49]
+#CHECK: srxt	%f0, %f13, 0            # encoding: [0xed,0xd0,0x00,0x00,0x00,0x49]
+#CHECK: srxt	%f13, %f0, 0            # encoding: [0xed,0x00,0x00,0x00,0xd0,0x49]
+#CHECK: srxt	%f13, %f13, 0           # encoding: [0xed,0xd0,0x00,0x00,0xd0,0x49]
+
+	srxt	%f0, %f0, 0
+	srxt	%f0, %f0, 4095
+	srxt	%f0, %f0, 0(%r1)
+	srxt	%f0, %f0, 0(%r15)
+	srxt	%f0, %f0, 4095(%r1,%r15)
+	srxt	%f0, %f0, 4095(%r15,%r1)
+	srxt	%f0, %f13, 0
+	srxt	%f13, %f0, 0
+	srxt	%f13, %f13, 0
+
 #CHECK: st	%r0, 0                  # encoding: [0x50,0x00,0x00,0x00]
 #CHECK: st	%r0, 4095               # encoding: [0x50,0x00,0x0f,0xff]
 #CHECK: st	%r0, 0(%r1)             # encoding: [0x50,0x00,0x10,0x00]
@@ -11929,6 +13850,32 @@
 	sty	%r0, 524287(%r15,%r1)
 	sty	%r15, 0
 
+#CHECK: su	%f0, 0                  # encoding: [0x7f,0x00,0x00,0x00]
+#CHECK: su	%f0, 4095               # encoding: [0x7f,0x00,0x0f,0xff]
+#CHECK: su	%f0, 0(%r1)             # encoding: [0x7f,0x00,0x10,0x00]
+#CHECK: su	%f0, 0(%r15)            # encoding: [0x7f,0x00,0xf0,0x00]
+#CHECK: su	%f0, 4095(%r1,%r15)     # encoding: [0x7f,0x01,0xff,0xff]
+#CHECK: su	%f0, 4095(%r15,%r1)     # encoding: [0x7f,0x0f,0x1f,0xff]
+#CHECK: su	%f15, 0                 # encoding: [0x7f,0xf0,0x00,0x00]
+
+	su	%f0, 0
+	su	%f0, 4095
+	su	%f0, 0(%r1)
+	su	%f0, 0(%r15)
+	su	%f0, 4095(%r1,%r15)
+	su	%f0, 4095(%r15,%r1)
+	su	%f15, 0
+
+#CHECK: sur	%f0, %f0                # encoding: [0x3f,0x00]
+#CHECK: sur	%f0, %f15               # encoding: [0x3f,0x0f]
+#CHECK: sur	%f7, %f8                # encoding: [0x3f,0x78]
+#CHECK: sur	%f15, %f0               # encoding: [0x3f,0xf0]
+
+	sur	%f0, %f0
+	sur	%f0, %f15
+	sur	%f7, %f8
+	sur	%f15, %f0
+
 #CHECK: svc	0			# encoding: [0x0a,0x00]
 #CHECK: svc	3			# encoding: [0x0a,0x03]
 #CHECK: svc	128			# encoding: [0x0a,0x80]
@@ -11939,6 +13886,32 @@
 	svc	128
 	svc	0xff
 
+#CHECK: sw	%f0, 0                  # encoding: [0x6f,0x00,0x00,0x00]
+#CHECK: sw	%f0, 4095               # encoding: [0x6f,0x00,0x0f,0xff]
+#CHECK: sw	%f0, 0(%r1)             # encoding: [0x6f,0x00,0x10,0x00]
+#CHECK: sw	%f0, 0(%r15)            # encoding: [0x6f,0x00,0xf0,0x00]
+#CHECK: sw	%f0, 4095(%r1,%r15)     # encoding: [0x6f,0x01,0xff,0xff]
+#CHECK: sw	%f0, 4095(%r15,%r1)     # encoding: [0x6f,0x0f,0x1f,0xff]
+#CHECK: sw	%f15, 0                 # encoding: [0x6f,0xf0,0x00,0x00]
+
+	sw	%f0, 0
+	sw	%f0, 4095
+	sw	%f0, 0(%r1)
+	sw	%f0, 0(%r15)
+	sw	%f0, 4095(%r1,%r15)
+	sw	%f0, 4095(%r15,%r1)
+	sw	%f15, 0
+
+#CHECK: swr	%f0, %f0                # encoding: [0x2f,0x00]
+#CHECK: swr	%f0, %f15               # encoding: [0x2f,0x0f]
+#CHECK: swr	%f7, %f8                # encoding: [0x2f,0x78]
+#CHECK: swr	%f15, %f0               # encoding: [0x2f,0xf0]
+
+	swr	%f0, %f0
+	swr	%f0, %f15
+	swr	%f7, %f8
+	swr	%f15, %f0
+
 #CHECK: sxbr	%f0, %f0                # encoding: [0xb3,0x4b,0x00,0x00]
 #CHECK: sxbr	%f0, %f13               # encoding: [0xb3,0x4b,0x00,0x0d]
 #CHECK: sxbr	%f8, %f8                # encoding: [0xb3,0x4b,0x00,0x88]
@@ -11949,6 +13922,28 @@
 	sxbr	%f8, %f8
 	sxbr	%f13, %f0
 
+#CHECK: sxr	%f0, %f0                # encoding: [0x37,0x00]
+#CHECK: sxr	%f0, %f13               # encoding: [0x37,0x0d]
+#CHECK: sxr	%f8, %f8                # encoding: [0x37,0x88]
+#CHECK: sxr	%f13, %f0               # encoding: [0x37,0xd0]
+
+	sxr	%f0, %f0
+	sxr	%f0, %f13
+	sxr	%f8, %f8
+	sxr	%f13, %f0
+
+#CHECK: sxtr	%f0, %f0, %f0           # encoding: [0xb3,0xdb,0x00,0x00]
+#CHECK: sxtr	%f0, %f0, %f13          # encoding: [0xb3,0xdb,0xd0,0x00]
+#CHECK: sxtr	%f0, %f13, %f0          # encoding: [0xb3,0xdb,0x00,0x0d]
+#CHECK: sxtr	%f13, %f0, %f0          # encoding: [0xb3,0xdb,0x00,0xd0]
+#CHECK: sxtr	%f8, %f8, %f8           # encoding: [0xb3,0xdb,0x80,0x88]
+
+	sxtr	%f0, %f0, %f0
+	sxtr	%f0, %f0, %f13
+	sxtr	%f0, %f13, %f0
+	sxtr	%f13, %f0, %f0
+	sxtr	%f8, %f8, %f8
+
 #CHECK: sy	%r0, -524288            # encoding: [0xe3,0x00,0x00,0x00,0x80,0x5b]
 #CHECK: sy	%r0, -1                 # encoding: [0xe3,0x00,0x0f,0xff,0xff,0x5b]
 #CHECK: sy	%r0, 0                  # encoding: [0xe3,0x00,0x00,0x00,0x00,0x5b]
@@ -11975,6 +13970,30 @@
 
 	tam
 
+#CHECK: tbdr	%f0, 0, %f0             # encoding: [0xb3,0x51,0x00,0x00]
+#CHECK: tbdr	%f0, 0, %f15            # encoding: [0xb3,0x51,0x00,0x0f]
+#CHECK: tbdr	%f0, 15, %f0            # encoding: [0xb3,0x51,0xf0,0x00]
+#CHECK: tbdr	%f4, 5, %f6             # encoding: [0xb3,0x51,0x50,0x46]
+#CHECK: tbdr	%f15, 0, %f0            # encoding: [0xb3,0x51,0x00,0xf0]
+
+	tbdr	%f0, 0, %f0
+	tbdr	%f0, 0, %f15
+	tbdr	%f0, 15, %f0
+	tbdr	%f4, 5, %f6
+	tbdr	%f15, 0, %f0
+
+#CHECK: tbedr	%f0, 0, %f0             # encoding: [0xb3,0x50,0x00,0x00]
+#CHECK: tbedr	%f0, 0, %f15            # encoding: [0xb3,0x50,0x00,0x0f]
+#CHECK: tbedr	%f0, 15, %f0            # encoding: [0xb3,0x50,0xf0,0x00]
+#CHECK: tbedr	%f4, 5, %f6             # encoding: [0xb3,0x50,0x50,0x46]
+#CHECK: tbedr	%f15, 0, %f0            # encoding: [0xb3,0x50,0x00,0xf0]
+
+	tbedr	%f0, 0, %f0
+	tbedr	%f0, 0, %f15
+	tbedr	%f0, 15, %f0
+	tbedr	%f4, 5, %f6
+	tbedr	%f15, 0, %f0
+
 #CHECK: tcdb	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x11]
 #CHECK: tcdb	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x11]
 #CHECK: tcdb	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x11]
@@ -12023,6 +14042,122 @@
 	tcxb	%f0, 4095(%r15,%r1)
 	tcxb	%f13, 0
 
+#CHECK: tdcdt	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x54]
+#CHECK: tdcdt	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x54]
+#CHECK: tdcdt	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x54]
+#CHECK: tdcdt	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x54]
+#CHECK: tdcdt	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x54]
+#CHECK: tdcdt	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x54]
+#CHECK: tdcdt	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x54]
+
+	tdcdt	%f0, 0
+	tdcdt	%f0, 4095
+	tdcdt	%f0, 0(%r1)
+	tdcdt	%f0, 0(%r15)
+	tdcdt	%f0, 4095(%r1,%r15)
+	tdcdt	%f0, 4095(%r15,%r1)
+	tdcdt	%f15, 0
+
+#CHECK: tdcet	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x50]
+#CHECK: tdcet	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x50]
+#CHECK: tdcet	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x50]
+#CHECK: tdcet	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x50]
+#CHECK: tdcet	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x50]
+#CHECK: tdcet	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x50]
+#CHECK: tdcet	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x50]
+
+	tdcet	%f0, 0
+	tdcet	%f0, 4095
+	tdcet	%f0, 0(%r1)
+	tdcet	%f0, 0(%r15)
+	tdcet	%f0, 4095(%r1,%r15)
+	tdcet	%f0, 4095(%r15,%r1)
+	tdcet	%f15, 0
+
+#CHECK: tdcxt	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x58]
+#CHECK: tdcxt	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x58]
+#CHECK: tdcxt	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x58]
+#CHECK: tdcxt	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x58]
+#CHECK: tdcxt	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x58]
+#CHECK: tdcxt	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x58]
+#CHECK: tdcxt	%f13, 0                 # encoding: [0xed,0xd0,0x00,0x00,0x00,0x58]
+
+	tdcxt	%f0, 0
+	tdcxt	%f0, 4095
+	tdcxt	%f0, 0(%r1)
+	tdcxt	%f0, 0(%r15)
+	tdcxt	%f0, 4095(%r1,%r15)
+	tdcxt	%f0, 4095(%r15,%r1)
+	tdcxt	%f13, 0
+
+#CHECK: tdgdt	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x55]
+#CHECK: tdgdt	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x55]
+#CHECK: tdgdt	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x55]
+#CHECK: tdgdt	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x55]
+#CHECK: tdgdt	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x55]
+#CHECK: tdgdt	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x55]
+#CHECK: tdgdt	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x55]
+
+	tdgdt	%f0, 0
+	tdgdt	%f0, 4095
+	tdgdt	%f0, 0(%r1)
+	tdgdt	%f0, 0(%r15)
+	tdgdt	%f0, 4095(%r1,%r15)
+	tdgdt	%f0, 4095(%r15,%r1)
+	tdgdt	%f15, 0
+
+#CHECK: tdget	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x51]
+#CHECK: tdget	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x51]
+#CHECK: tdget	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x51]
+#CHECK: tdget	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x51]
+#CHECK: tdget	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x51]
+#CHECK: tdget	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x51]
+#CHECK: tdget	%f15, 0                 # encoding: [0xed,0xf0,0x00,0x00,0x00,0x51]
+
+	tdget	%f0, 0
+	tdget	%f0, 4095
+	tdget	%f0, 0(%r1)
+	tdget	%f0, 0(%r15)
+	tdget	%f0, 4095(%r1,%r15)
+	tdget	%f0, 4095(%r15,%r1)
+	tdget	%f15, 0
+
+#CHECK: tdgxt	%f0, 0                  # encoding: [0xed,0x00,0x00,0x00,0x00,0x59]
+#CHECK: tdgxt	%f0, 4095               # encoding: [0xed,0x00,0x0f,0xff,0x00,0x59]
+#CHECK: tdgxt	%f0, 0(%r1)             # encoding: [0xed,0x00,0x10,0x00,0x00,0x59]
+#CHECK: tdgxt	%f0, 0(%r15)            # encoding: [0xed,0x00,0xf0,0x00,0x00,0x59]
+#CHECK: tdgxt	%f0, 4095(%r1,%r15)     # encoding: [0xed,0x01,0xff,0xff,0x00,0x59]
+#CHECK: tdgxt	%f0, 4095(%r15,%r1)     # encoding: [0xed,0x0f,0x1f,0xff,0x00,0x59]
+#CHECK: tdgxt	%f13, 0                 # encoding: [0xed,0xd0,0x00,0x00,0x00,0x59]
+
+	tdgxt	%f0, 0
+	tdgxt	%f0, 4095
+	tdgxt	%f0, 0(%r1)
+	tdgxt	%f0, 0(%r15)
+	tdgxt	%f0, 4095(%r1,%r15)
+	tdgxt	%f0, 4095(%r15,%r1)
+	tdgxt	%f13, 0
+
+#CHECK: thder	%f0, %f9                # encoding: [0xb3,0x58,0x00,0x09]
+#CHECK: thder	%f0, %f15               # encoding: [0xb3,0x58,0x00,0x0f]
+#CHECK: thder	%f15, %f0               # encoding: [0xb3,0x58,0x00,0xf0]
+#CHECK: thder	%f15, %f9               # encoding: [0xb3,0x58,0x00,0xf9]
+
+	thder	%f0,%f9
+	thder	%f0,%f15
+	thder	%f15,%f0
+	thder	%f15,%f9
+
+#CHECK: thdr	%f0, %f9                # encoding: [0xb3,0x59,0x00,0x09]
+#CHECK: thdr	%f0, %f15               # encoding: [0xb3,0x59,0x00,0x0f]
+#CHECK: thdr	%f15, %f0               # encoding: [0xb3,0x59,0x00,0xf0]
+#CHECK: thdr	%f15, %f9               # encoding: [0xb3,0x59,0x00,0xf9]
+
+	thdr	%f0,%f9
+	thdr	%f0,%f15
+	thdr	%f15,%f0
+	thdr	%f15,%f9
+
 #CHECK: tm	0, 0                    # encoding: [0x91,0x00,0x00,0x00]
 #CHECK: tm	4095, 0                 # encoding: [0x91,0x00,0x0f,0xff]
 #CHECK: tm	0, 255                  # encoding: [0x91,0xff,0x00,0x00]
diff --git a/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll b/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
deleted file mode 100644
index a7f414b8694b..000000000000
--- a/test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; REQUIRES: asserts
-; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -S -debug-only=loop-vectorize 2>&1 | FileCheck %s
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64--linux-gnu"
-
-; CHECK-LABEL: all_scalar
-; CHECK:       LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Found an estimated cost of 2 for VF 2 For instruction: %i.next = add nuw nsw i64 %i, 2
-; CHECK:       LV: Not considering vector loop of width 2 because it will not generate any vector instructions
-;
-define void @all_scalar(i64* %a, i64 %n) {
-entry:
-  br label %for.body
-
-for.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %tmp0 = getelementptr i64, i64* %a, i64 %i
-  store i64 0, i64* %tmp0, align 1
-  %i.next = add nuw nsw i64 %i, 2
-  %cond = icmp eq i64 %i.next, %n
-  br i1 %cond, label %for.end, label %for.body
-
-for.end:
-  ret void
-}
diff --git a/tools/llvm-pdbdump/C13DebugFragmentVisitor.cpp b/tools/llvm-pdbdump/C13DebugFragmentVisitor.cpp
index b38b36532a71..78971eb5879a 100644
--- a/tools/llvm-pdbdump/C13DebugFragmentVisitor.cpp
+++ b/tools/llvm-pdbdump/C13DebugFragmentVisitor.cpp
@@ -9,9 +9,9 @@
 
 #include "C13DebugFragmentVisitor.h"
 
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/PDBStringTable.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
@@ -25,25 +25,25 @@ C13DebugFragmentVisitor::C13DebugFragmentVisitor(PDBFile &F) : F(F) {}
 C13DebugFragmentVisitor::~C13DebugFragmentVisitor() {}
 
 Error C13DebugFragmentVisitor::visitUnknown(
-    codeview::ModuleDebugUnknownFragmentRef &Fragment) {
+    codeview::DebugUnknownSubsectionRef &Fragment) {
   return Error::success();
 }
 
 Error C13DebugFragmentVisitor::visitFileChecksums(
-    codeview::ModuleDebugFileChecksumFragmentRef &Checksums) {
+    codeview::DebugChecksumsSubsectionRef &Checksums) {
   assert(!this->Checksums.hasValue());
   this->Checksums = Checksums;
   return Error::success();
 }
 
 Error C13DebugFragmentVisitor::visitLines(
-    codeview::ModuleDebugLineFragmentRef &Lines) {
+    codeview::DebugLinesSubsectionRef &Lines) {
   this->Lines.push_back(Lines);
   return Error::success();
 }
 
 Error C13DebugFragmentVisitor::visitInlineeLines(
-    codeview::ModuleDebugInlineeLineFragmentRef &Lines) {
+    codeview::DebugInlineeLinesSubsectionRef &Lines) {
   this->InlineeLines.push_back(Lines);
   return Error::success();
 }
diff --git a/tools/llvm-pdbdump/C13DebugFragmentVisitor.h b/tools/llvm-pdbdump/C13DebugFragmentVisitor.h
index 1054b0c9f6e0..a12f282c4c5c 100644
--- a/tools/llvm-pdbdump/C13DebugFragmentVisitor.h
+++ b/tools/llvm-pdbdump/C13DebugFragmentVisitor.h
@@ -11,8 +11,8 @@
 #define LLVM_TOOLS_LLVMPDBDUMP_C13DEBUGFRAGMENTVISITOR_H
 
 #include "llvm/ADT/Optional.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
 #include "llvm/Support/Error.h"
 
 #include <vector>
@@ -23,20 +23,20 @@ namespace pdb {
 
 class PDBFile;
 
-class C13DebugFragmentVisitor : public codeview::ModuleDebugFragmentVisitor {
+class C13DebugFragmentVisitor : public codeview::DebugSubsectionVisitor {
 public:
   C13DebugFragmentVisitor(PDBFile &F);
   ~C13DebugFragmentVisitor();
 
-  Error visitUnknown(codeview::ModuleDebugUnknownFragmentRef &Fragment) final;
-
-  Error visitFileChecksums(
-      codeview::ModuleDebugFileChecksumFragmentRef &Checksums) final;
-
-  Error visitLines(codeview::ModuleDebugLineFragmentRef &Lines) final;
+  Error visitUnknown(codeview::DebugUnknownSubsectionRef &Fragment) final;
 
   Error
-  visitInlineeLines(codeview::ModuleDebugInlineeLineFragmentRef &Lines) final;
+  visitFileChecksums(codeview::DebugChecksumsSubsectionRef &Checksums) final;
+
+  Error visitLines(codeview::DebugLinesSubsectionRef &Lines) final;
+
+  Error
+  visitInlineeLines(codeview::DebugInlineeLinesSubsectionRef &Lines) final;
 
   Error finished() final;
 
@@ -48,9 +48,9 @@ class C13DebugFragmentVisitor : public codeview::ModuleDebugFragmentVisitor {
   Expected<StringRef> getNameFromStringTable(uint32_t Offset);
   Expected<StringRef> getNameFromChecksumsBuffer(uint32_t Offset);
 
-  Optional<codeview::ModuleDebugFileChecksumFragmentRef> Checksums;
-  std::vector<codeview::ModuleDebugInlineeLineFragmentRef> InlineeLines;
-  std::vector<codeview::ModuleDebugLineFragmentRef> Lines;
+  Optional<codeview::DebugChecksumsSubsectionRef> Checksums;
+  std::vector<codeview::DebugInlineeLinesSubsectionRef> InlineeLines;
+  std::vector<codeview::DebugLinesSubsectionRef> Lines;
 
   PDBFile &F;
 };
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
index 07d3b226d7c6..d95eca1aeddb 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
@@ -15,14 +15,14 @@
 #include "llvm-pdbdump.h"
 
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugUnknownSubsection.h"
 #include "llvm/DebugInfo/CodeView/EnumTables.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h"
 #include "llvm/DebugInfo/CodeView/SymbolDumper.h"
 #include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
 #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
@@ -830,8 +830,8 @@ Error LLVMOutputStyle::dumpDbiStream() {
             return ExpectedTypes.takeError();
           auto &IpiItems = *ExpectedTypes;
           C13RawVisitor V(P, File, IpiItems);
-          if (auto EC = codeview::visitModuleDebugFragments(
-                  ModS.linesAndChecksums(), V))
+          if (auto EC =
+                  codeview::visitDebugSubsections(ModS.linesAndChecksums(), V))
             return EC;
         }
       }
diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.cpp b/tools/llvm-pdbdump/YAMLOutputStyle.cpp
index 652182e8e9b3..7aa68dee7d47 100644
--- a/tools/llvm-pdbdump/YAMLOutputStyle.cpp
+++ b/tools/llvm-pdbdump/YAMLOutputStyle.cpp
@@ -13,13 +13,13 @@
 #include "PdbYaml.h"
 #include "llvm-pdbdump.h"
 
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionVisitor.h"
+#include "llvm/DebugInfo/CodeView/DebugUnknownSubsection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFragmentVisitor.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
@@ -207,8 +207,8 @@ YAMLOutputStyle::getFileLineInfo(const pdb::ModuleDebugStreamRef &ModS) {
 
   yaml::PdbSourceFileInfo Info;
   C13YamlVisitor Visitor(Info, File);
-  if (auto EC = codeview::visitModuleDebugFragments(ModS.linesAndChecksums(),
-                                                    Visitor))
+  if (auto EC =
+          codeview::visitDebugSubsections(ModS.linesAndChecksums(), Visitor))
     return std::move(EC);
 
   return Info;
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp
index ff14c39cbaab..baba862ae663 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.cpp
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -31,10 +31,10 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
 #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
 #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
@@ -543,8 +543,7 @@ static void yamlToPdb(StringRef Path) {
       // File Checksums must be emitted before line information, because line
       // info records use offsets into the checksum buffer to reference a file's
       // source file name.
-      auto Checksums =
-          llvm::make_unique<ModuleDebugFileChecksumFragment>(Strings);
+      auto Checksums = llvm::make_unique<DebugChecksumsSubsection>(Strings);
       auto &ChecksumRef = *Checksums;
       if (!FLI.FileChecksums.empty()) {
         for (auto &FC : FLI.FileChecksums)
@@ -554,7 +553,7 @@ static void yamlToPdb(StringRef Path) {
 
       for (const auto &Fragment : FLI.LineFragments) {
         auto Lines =
-            llvm::make_unique<ModuleDebugLineFragment>(ChecksumRef, Strings);
+            llvm::make_unique<DebugLinesSubsection>(ChecksumRef, Strings);
         Lines->setCodeSize(Fragment.CodeSize);
         Lines->setRelocationAddress(Fragment.RelocSegment,
                                     Fragment.RelocOffset);
@@ -582,7 +581,7 @@ static void yamlToPdb(StringRef Path) {
       }
 
       for (const auto &Inlinee : FLI.Inlinees) {
-        auto Inlinees = llvm::make_unique<ModuleDebugInlineeLineFragment>(
+        auto Inlinees = llvm::make_unique<DebugInlineeLinesSubsection>(
             ChecksumRef, Inlinee.HasExtraFiles);
         for (const auto &Site : Inlinee.Sites) {
           Inlinees->addInlineSite(Site.Inlinee, Site.FileName,
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index a59caa351e72..663f7b4c8a82 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -13,7 +13,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMWinEHPrinter.h"
-#include "CodeView.h"
 #include "Error.h"
 #include "ObjDumper.h"
 #include "StackMapPrinter.h"
@@ -24,13 +23,14 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/DebugChecksumsSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugFrameDataSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugInlineeLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugStringTableSubsection.h"
 #include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugFileChecksumFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugInlineeLinesFragment.h"
-#include "llvm/DebugInfo/CodeView/ModuleDebugLineFragment.h"
 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/CodeView/StringTable.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
 #include "llvm/DebugInfo/CodeView/SymbolDumpDelegate.h"
 #include "llvm/DebugInfo/CodeView/SymbolDumper.h"
@@ -157,9 +157,9 @@ class COFFDumper : public ObjDumper {
   bool RelocCached = false;
   RelocMapTy RelocMap;
 
-  VarStreamArray<FileChecksumEntry> CVFileChecksumTable;
+  DebugChecksumsSubsectionRef CVFileChecksumTable;
 
-  StringTableRef CVStringTable;
+  DebugStringTableSubsectionRef CVStringTable;
 
   ScopedPrinter &Writer;
   BinaryByteStream TypeContents;
@@ -200,7 +200,9 @@ class COFFObjectDumpDelegate : public SymbolDumpDelegate {
     return CD.getFileNameForFileOffset(FileOffset);
   }
 
-  StringTableRef getStringTable() override { return CD.CVStringTable; }
+  DebugStringTableSubsectionRef getStringTable() override {
+    return CD.CVStringTable;
+  }
 
 private:
   COFFDumper &CD;
@@ -515,19 +517,19 @@ WeakExternalCharacteristics[] = {
 };
 
 static const EnumEntry<uint32_t> SubSectionTypes[] = {
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, Symbols),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, Lines),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, StringTable),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, FileChecksums),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, FrameData),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, InlineeLines),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, CrossScopeImports),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, CrossScopeExports),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, ILLines),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, FuncMDTokenMap),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, TypeMDTokenMap),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, MergedAssemblyInput),
-    LLVM_READOBJ_ENUM_CLASS_ENT(ModuleDebugFragmentKind, CoffSymbolRVA),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, Symbols),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, Lines),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, StringTable),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, FileChecksums),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, FrameData),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, InlineeLines),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, CrossScopeImports),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, CrossScopeExports),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, ILLines),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, FuncMDTokenMap),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, TypeMDTokenMap),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, MergedAssemblyInput),
+    LLVM_READOBJ_ENUM_CLASS_ENT(DebugSubsectionKind, CoffSymbolRVA),
 };
 
 static const EnumEntry<uint32_t> FrameDataFlags[] = {
@@ -774,16 +776,14 @@ void COFFDumper::initializeFileAndStringTables(BinaryStreamReader &Reader) {
     StringRef Contents;
     error(Reader.readFixedString(Contents, SubSectionSize));
 
-    switch (ModuleDebugFragmentKind(SubType)) {
-    case ModuleDebugFragmentKind::FileChecksums: {
-      BinaryStreamReader CSR(Contents, support::little);
-      error(CSR.readArray(CVFileChecksumTable, CSR.getLength()));
+    BinaryStreamRef ST(Contents, support::little);
+    switch (DebugSubsectionKind(SubType)) {
+    case DebugSubsectionKind::FileChecksums:
+      error(CVFileChecksumTable.initialize(ST));
       break;
-    }
-    case ModuleDebugFragmentKind::StringTable: {
-      BinaryStreamRef ST(Contents, support::little);
+    case DebugSubsectionKind::StringTable:
       error(CVStringTable.initialize(ST));
-    } break;
+      break;
     default:
       break;
     }
@@ -847,20 +847,20 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
       printBinaryBlockWithRelocs("SubSectionContents", Section, SectionContents,
                                  Contents);
 
-    switch (ModuleDebugFragmentKind(SubType)) {
-    case ModuleDebugFragmentKind::Symbols:
+    switch (DebugSubsectionKind(SubType)) {
+    case DebugSubsectionKind::Symbols:
       printCodeViewSymbolsSubsection(Contents, Section, SectionContents);
       break;
 
-    case ModuleDebugFragmentKind::InlineeLines:
+    case DebugSubsectionKind::InlineeLines:
       printCodeViewInlineeLines(Contents);
       break;
 
-    case ModuleDebugFragmentKind::FileChecksums:
+    case DebugSubsectionKind::FileChecksums:
       printCodeViewFileChecksums(Contents);
       break;
 
-    case ModuleDebugFragmentKind::Lines: {
+    case DebugSubsectionKind::Lines: {
       // Holds a PC to file:line table.  Some data to parse this subsection is
       // stored in the other subsections, so just check sanity and store the
       // pointers for deferred processing.
@@ -886,34 +886,33 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
       FunctionNames.push_back(LinkageName);
       break;
     }
-    case ModuleDebugFragmentKind::FrameData: {
+    case DebugSubsectionKind::FrameData: {
       // First four bytes is a relocation against the function.
       BinaryStreamReader SR(Contents, llvm::support::little);
-      const uint32_t *CodePtr;
-      error(SR.readObject(CodePtr));
+
+      DebugFrameDataSubsectionRef FrameData;
+      error(FrameData.initialize(SR));
+
       StringRef LinkageName;
       error(resolveSymbolName(Obj->getCOFFSection(Section), SectionContents,
-                              CodePtr, LinkageName));
+                              FrameData.getRelocPtr(), LinkageName));
       W.printString("LinkageName", LinkageName);
 
       // To find the active frame description, search this array for the
       // smallest PC range that includes the current PC.
-      while (!SR.empty()) {
-        const FrameData *FD;
-        error(SR.readObject(FD));
-
-        StringRef FrameFunc = error(CVStringTable.getString(FD->FrameFunc));
+      for (const auto &FD : FrameData) {
+        StringRef FrameFunc = error(CVStringTable.getString(FD.FrameFunc));
 
         DictScope S(W, "FrameData");
-        W.printHex("RvaStart", FD->RvaStart);
-        W.printHex("CodeSize", FD->CodeSize);
-        W.printHex("LocalSize", FD->LocalSize);
-        W.printHex("ParamsSize", FD->ParamsSize);
-        W.printHex("MaxStackSize", FD->MaxStackSize);
+        W.printHex("RvaStart", FD.RvaStart);
+        W.printHex("CodeSize", FD.CodeSize);
+        W.printHex("LocalSize", FD.LocalSize);
+        W.printHex("ParamsSize", FD.ParamsSize);
+        W.printHex("MaxStackSize", FD.MaxStackSize);
         W.printString("FrameFunc", FrameFunc);
-        W.printHex("PrologSize", FD->PrologSize);
-        W.printHex("SavedRegsSize", FD->SavedRegsSize);
-        W.printFlags("Flags", FD->Flags, makeArrayRef(FrameDataFlags));
+        W.printHex("PrologSize", FD.PrologSize);
+        W.printHex("SavedRegsSize", FD.SavedRegsSize);
+        W.printFlags("Flags", FD.Flags, makeArrayRef(FrameDataFlags));
       }
       break;
     }
@@ -934,7 +933,7 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
 
     BinaryStreamReader Reader(FunctionLineTables[Name], support::little);
 
-    ModuleDebugLineFragmentRef LineInfo;
+    DebugLinesSubsectionRef LineInfo;
     error(LineInfo.initialize(Reader));
 
     W.printHex("Flags", LineInfo.header()->Flags);
@@ -996,9 +995,9 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
 }
 
 void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) {
-  BinaryStreamReader SR(Subsection, llvm::support::little);
-  ModuleDebugFileChecksumFragmentRef Checksums;
-  error(Checksums.initialize(SR));
+  BinaryStreamRef Stream(Subsection, llvm::support::little);
+  DebugChecksumsSubsectionRef Checksums;
+  error(Checksums.initialize(Stream));
 
   for (auto &FC : Checksums) {
     DictScope S(W, "FileChecksum");
@@ -1015,7 +1014,7 @@ void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) {
 
 void COFFDumper::printCodeViewInlineeLines(StringRef Subsection) {
   BinaryStreamReader SR(Subsection, llvm::support::little);
-  ModuleDebugInlineeLineFragmentRef Lines;
+  DebugInlineeLinesSubsectionRef Lines;
   error(Lines.initialize(SR));
 
   for (auto &Line : Lines) {
@@ -1039,7 +1038,7 @@ StringRef COFFDumper::getFileNameForFileOffset(uint32_t FileOffset) {
   if (!CVFileChecksumTable.valid() || !CVStringTable.valid())
     error(object_error::parse_failed);
 
-  auto Iter = CVFileChecksumTable.at(FileOffset);
+  auto Iter = CVFileChecksumTable.getArray().at(FileOffset);
 
   // Check if the file checksum table offset is valid.
   if (Iter == CVFileChecksumTable.end())
diff --git a/tools/llvm-readobj/CodeView.h b/tools/llvm-readobj/CodeView.h
deleted file mode 100644
index cf713962eb7f..000000000000
--- a/tools/llvm-readobj/CodeView.h
+++ /dev/null
@@ -1,54 +0,0 @@
-//===-- CodeView.h - On-disk record types for CodeView ----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file provides data structures useful for consuming on-disk
-/// CodeView. It is based on information published by Microsoft at
-/// https://github.com/Microsoft/microsoft-pdb/.
-///
-//===----------------------------------------------------------------------===//
-
-// FIXME: Find a home for this in include/llvm/DebugInfo/CodeView/.
-
-#ifndef LLVM_READOBJ_CODEVIEW_H
-#define LLVM_READOBJ_CODEVIEW_H
-
-#include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/Support/Endian.h"
-
-namespace llvm {
-namespace codeview {
-
-using llvm::support::ulittle16_t;
-using llvm::support::ulittle32_t;
-
-/// Data in the the SUBSEC_FRAMEDATA subection.
-struct FrameData {
-  ulittle32_t RvaStart;
-  ulittle32_t CodeSize;
-  ulittle32_t LocalSize;
-  ulittle32_t ParamsSize;
-  ulittle32_t MaxStackSize;
-  ulittle32_t FrameFunc;
-  ulittle16_t PrologSize;
-  ulittle16_t SavedRegsSize;
-  ulittle32_t Flags;
-  enum : uint32_t {
-    HasSEH = 1 << 0,
-    HasEH = 1 << 1,
-    IsFunctionStart = 1 << 2,
-  };
-};
-
-
-} // namespace codeview
-} // namespace llvm
-
-#endif // LLVM_READOBJ_CODEVIEW_H
diff --git a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
index 9e846674648c..3d14eb736df2 100644
--- a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
+++ b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
@@ -311,9 +311,8 @@ void TestAllForms() {
   EXPECT_EQ(Data2, toReference(DieDG.find(Attr_DW_FORM_ref2), 0));
   EXPECT_EQ(Data4, toReference(DieDG.find(Attr_DW_FORM_ref4), 0));
   EXPECT_EQ(Data8, toReference(DieDG.find(Attr_DW_FORM_ref8), 0));
-  if (Version >= 4) {
+  if (Version >= 4)
     EXPECT_EQ(Data8_2, toReference(DieDG.find(Attr_DW_FORM_ref_sig8), 0));
-  }
   EXPECT_EQ(UData[0], toReference(DieDG.find(Attr_DW_FORM_ref_udata), 0));
 
   //----------------------------------------------------------------------
@@ -321,17 +320,15 @@ void TestAllForms() {
   //----------------------------------------------------------------------
   EXPECT_EQ(1ULL, toUnsigned(DieDG.find(Attr_DW_FORM_flag_true), 0));
   EXPECT_EQ(0ULL, toUnsigned(DieDG.find(Attr_DW_FORM_flag_false), 1));
-  if (Version >= 4) {
+  if (Version >= 4)
     EXPECT_EQ(1ULL, toUnsigned(DieDG.find(Attr_DW_FORM_flag_present), 0));
-  }
 
   //----------------------------------------------------------------------
   // Test SLEB128 based forms
   //----------------------------------------------------------------------
   EXPECT_EQ(SData, toSigned(DieDG.find(Attr_DW_FORM_sdata), 0));
-  if (Version >= 5) {
+  if (Version >= 5)
     EXPECT_EQ(ICSData, toSigned(DieDG.find(Attr_DW_FORM_implicit_const), 0));
-  }
 
   //----------------------------------------------------------------------
   // Test ULEB128 based forms
@@ -343,10 +340,9 @@ void TestAllForms() {
   //----------------------------------------------------------------------
   EXPECT_EQ(Dwarf32Values[0],
             toReference(DieDG.find(Attr_DW_FORM_GNU_ref_alt), 0));
-  if (Version >= 4) {
+  if (Version >= 4)
     EXPECT_EQ(Dwarf32Values[1],
               toSectionOffset(DieDG.find(Attr_DW_FORM_sec_offset), 0));
-  }
 
   //----------------------------------------------------------------------
   // Add an address at the end to make sure we can decode this value
diff --git a/unittests/Support/ManagedStatic.cpp b/unittests/Support/ManagedStatic.cpp
index 153884ba4298..4e2e93036a83 100644
--- a/unittests/Support/ManagedStatic.cpp
+++ b/unittests/Support/ManagedStatic.cpp
@@ -57,4 +57,45 @@ TEST(Initialize, MultipleThreads) {
 }
 #endif
 
+namespace NestedStatics {
+static ManagedStatic<int> Ms1;
+struct Nest {
+  Nest() {
+    ++(*Ms1);
+  }
+
+  ~Nest() {
+    assert(Ms1.isConstructed());
+    ++(*Ms1);
+  }
+};
+static ManagedStatic<Nest> Ms2;
+
+TEST(ManagedStaticTest, NestedStatics) {
+  EXPECT_FALSE(Ms1.isConstructed());
+  EXPECT_FALSE(Ms2.isConstructed());
+
+  *Ms2;
+  EXPECT_TRUE(Ms1.isConstructed());
+  EXPECT_TRUE(Ms2.isConstructed());
+}
+} // namespace NestedStatics
+
+namespace CustomCreatorDeletor {
+struct CustomCreate {
+  static void *call() {
+    void *Mem = std::malloc(sizeof(int));
+    *((int *)Mem) = 42;
+    return Mem;
+  }
+};
+struct CustomDelete {
+  static void call(void *P) { std::free(P); }
+};
+static ManagedStatic<int, CustomCreate, CustomDelete> Custom;
+TEST(ManagedStaticTest, CustomCreatorDeletor) {
+  EXPECT_EQ(42, *Custom);
+}
+} // namespace CustomCreatorDeletor
+
 } // anonymous namespace
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index 185119c2ca43..e0303b7b1ab4 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -1602,7 +1602,7 @@ Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
 
 Error GlobalISelEmitter::importDefaultOperandRenderers(
     BuildMIAction &DstMIBuilder, DagInit *DefaultOps) const {
-  for (const auto *DefaultOp : DefaultOps->args()) {
+  for (const auto *DefaultOp : DefaultOps->getArgs()) {
     // Look through ValueType operators.
     if (const DagInit *DefaultDagOp = dyn_cast<DagInit>(DefaultOp)) {
       if (const DefInit *DefaultDagOperator =
diff --git a/utils/TableGen/X86FoldTablesEmitter.cpp b/utils/TableGen/X86FoldTablesEmitter.cpp
index f211a8fab975..99429c5f96a2 100644
--- a/utils/TableGen/X86FoldTablesEmitter.cpp
+++ b/utils/TableGen/X86FoldTablesEmitter.cpp
@@ -77,7 +77,14 @@ const char *const NoFoldSet[] = {
     "TCRETURNri64",
     "TCRETURNmi64", // Special dealing (in X86InstrCompiler.td under
     "TCRETURNri",   // "tailcall stuff" section).
-    "TCRETURNmi"
+    "TCRETURNmi",
+
+    // Never fold XCHG, the register and memory forms have different locking
+    // semantics.
+    "XCHG8rr",  "XCHG8rm",
+    "XCHG16rr", "XCHG16rm",
+    "XCHG32rr", "XCHG32rm",
+    "XCHG64rr", "XCHG64rm",
 
     // Different calculations of the folded operand between
     // memory and register forms (folding is illegal).

From 550ae89a710bf458d47e5b1d183f5e7039c2b384 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Tue, 30 May 2017 17:37:44 +0000
Subject: [PATCH 2/2] Vendor import of clang trunk r304222:
 https://llvm.org/svn/llvm-project/cfe/trunk@304222

---
 include/clang-c/Index.h                       |   11 +-
 include/clang/Basic/DiagnosticLexKinds.td     |    2 +
 include/clang/Basic/DiagnosticSemaKinds.td    |    9 +-
 include/clang/Frontend/ASTUnit.h              |    5 +
 include/clang/Lex/Preprocessor.h              |   64 +
 include/clang/Lex/PreprocessorLexer.h         |    6 +
 include/clang/Lex/PreprocessorOptions.h       |   10 +-
 include/clang/Serialization/ASTBitCodes.h     |    3 +
 .../clang/StaticAnalyzer/Checkers/Checkers.td |    8 +-
 .../Core/PathSensitive/ProgramState.h         |   13 +
 .../Core/PathSensitive/TaintManager.h         |   10 +
 lib/CodeGen/CGCoroutine.cpp                   |    3 +-
 lib/Frontend/ASTUnit.cpp                      |   28 +-
 lib/Frontend/SerializedDiagnosticPrinter.cpp  |    2 +-
 lib/Headers/altivec.h                         |  106 +-
 lib/Lex/Lexer.cpp                             |   39 +-
 lib/Lex/PPDirectives.cpp                      |   41 +-
 lib/Lex/PPLexerChange.cpp                     |    6 +
 lib/Lex/Pragma.cpp                            |   18 +
 lib/Lex/Preprocessor.cpp                      |    9 +
 lib/Sema/SemaDecl.cpp                         |    3 +
 lib/Sema/SemaExpr.cpp                         |   28 +-
 lib/Sema/SemaOpenMP.cpp                       |   43 +-
 lib/Serialization/ASTReader.cpp               |   15 +
 lib/Serialization/ASTWriter.cpp               |   13 +
 lib/StaticAnalyzer/Checkers/CMakeLists.txt    |    2 +-
 .../Checkers/GenericTaintChecker.cpp          |   83 +-
 .../Checkers/IteratorChecker.cpp              |  833 +++++++++++
 .../Checkers/IteratorPastEndChecker.cpp       |  840 -----------
 .../Checkers/PthreadLockChecker.cpp           |  148 +-
 lib/StaticAnalyzer/Core/ProgramState.cpp      |  100 +-
 lib/StaticAnalyzer/Core/RegionStore.cpp       |    5 +-
 .../Inputs/system-header-simulator-cxx.h      |  389 +++++-
 .../diagnostics/explicit-suppression.cpp      |    2 +-
 test/Analysis/iterator-past-end.cpp           |  205 ---
 test/Analysis/iterator-range.cpp              |   19 +
 test/Analysis/pthreadlock.c                   |   79 ++
 test/Analysis/taint-generic.c                 |   41 +-
 test/CodeGen/altivec-ct.c                     |   82 ++
 test/CodeGen/arm_neon_intrinsics.c            | 1239 +++++++++--------
 test/CodeGen/union-align.c                    |    4 +-
 test/CodeGenOpenCL/bool_cast.cl               |    2 +-
 test/Lexer/preamble.c                         |   11 +-
 test/Lexer/preamble2.c                        |   19 +
 test/Modules/Inputs/preprocess/a.h            |    2 +
 test/Modules/Inputs/preprocess/b.h            |    2 +
 test/Modules/Inputs/preprocess/c.h            |    4 +
 .../Inputs/preprocess/module.modulemap        |    5 +
 test/Modules/preprocess-nested.cpp            |   59 +
 test/Modules/preprocess-unavailable.cpp       |   12 +
 test/OpenMP/target_data_messages.c            |    2 +-
 test/OpenMP/target_enter_data_map_messages.c  |    2 +-
 .../target_enter_data_nowait_messages.cpp     |    2 +-
 test/OpenMP/target_exit_data_map_messages.c   |    2 +-
 .../target_exit_data_nowait_messages.cpp      |    2 +-
 test/OpenMP/target_map_messages.cpp           |    2 +-
 test/OpenMP/target_teams_map_messages.cpp     |    2 +-
 test/SemaOpenCL/arithmetic-conversions.cl     |   23 +
 test/SemaOpenCL/clang-builtin-version.cl      |   59 +-
 test/SemaOpenCL/cond.cl                       |    2 +-
 test/SemaOpenCL/to_addr_builtin.cl            |    2 +-
 tools/c-index-test/c-index-test.c             |    2 +
 tools/libclang/CIndex.cpp                     |   14 +
 tools/libclang/libclang.exports               |    1 +
 64 files changed, 2789 insertions(+), 2010 deletions(-)
 create mode 100644 lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
 delete mode 100644 lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp
 delete mode 100644 test/Analysis/iterator-past-end.cpp
 create mode 100644 test/Analysis/iterator-range.cpp
 create mode 100644 test/CodeGen/altivec-ct.c
 create mode 100644 test/Lexer/preamble2.c
 create mode 100644 test/Modules/Inputs/preprocess/a.h
 create mode 100644 test/Modules/Inputs/preprocess/b.h
 create mode 100644 test/Modules/Inputs/preprocess/c.h
 create mode 100644 test/Modules/preprocess-nested.cpp
 create mode 100644 test/Modules/preprocess-unavailable.cpp
 create mode 100644 test/SemaOpenCL/arithmetic-conversions.cl

diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h
index aeda39308f91..91b3d11a549a 100644
--- a/include/clang-c/Index.h
+++ b/include/clang-c/Index.h
@@ -32,7 +32,7 @@
  * compatible, thus CINDEX_VERSION_MAJOR is expected to remain stable.
  */
 #define CINDEX_VERSION_MAJOR 0
-#define CINDEX_VERSION_MINOR 40
+#define CINDEX_VERSION_MINOR 41
 
 #define CINDEX_VERSION_ENCODE(major, minor) ( \
       ((major) * 10000)                       \
@@ -1418,6 +1418,15 @@ CINDEX_LINKAGE int clang_saveTranslationUnit(CXTranslationUnit TU,
                                              const char *FileName,
                                              unsigned options);
 
+/**
+ * \brief Suspend a translation unit in order to free memory associated with it.
+ *
+ * A suspended translation unit uses significantly less memory but on the other
+ * side does not support any other calls than \c clang_reparseTranslationUnit
+ * to resume it or \c clang_disposeTranslationUnit to dispose it completely.
+ */
+CINDEX_LINKAGE unsigned clang_suspendTranslationUnit(CXTranslationUnit);
+
 /**
  * \brief Destroy the specified CXTranslationUnit object.
  */
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index 77db8993f018..b393ce5f1545 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -525,6 +525,8 @@ def err_pp_module_begin_without_module_end : Error<
 def err_pp_module_end_without_module_begin : Error<
   "no matching '#pragma clang module begin' for this "
   "'#pragma clang module end'">;
+def note_pp_module_begin_here : Note<
+  "entering module '%0' due to this pragma">;
 
 def err_defined_macro_name : Error<"'defined' cannot be used as a macro name">;
 def err_paste_at_start : Error<
diff --git a/include/clang/Basic/DiagnosticSemaKinds.td b/include/clang/Basic/DiagnosticSemaKinds.td
index 4934bcfa3890..629e8b837f59 100644
--- a/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/include/clang/Basic/DiagnosticSemaKinds.td
@@ -8312,8 +8312,13 @@ def err_opencl_bitfields : Error<
   "bit-fields are not supported in OpenCL">;
 def err_opencl_vla : Error<
   "variable length arrays are not supported in OpenCL">;
+def err_opencl_scalar_type_rank_greater_than_vector_type : Error<
+    "scalar operand type has greater rank than the type of the vector "
+    "element. (%0 and %1)">;
 def err_bad_kernel_param_type : Error<
   "%0 cannot be used as the type of a kernel parameter">;
+def err_opencl_implicit_function_decl : Error<
+  "implicit declaration of function %0 is invalid in OpenCL">;
 def err_record_with_pointers_kernel_param : Error<
   "%select{struct|union}0 kernel parameters may not contain pointers">;
 def note_within_field_of_type : Note<
@@ -8744,8 +8749,8 @@ def err_omp_not_mappable_type : Error<
   "type %0 is not mappable to target">;
 def err_omp_invalid_map_type_for_directive : Error<
   "%select{map type '%1' is not allowed|map type must be specified}0 for '#pragma omp %2'">;
-def err_omp_no_map_for_directive : Error<
-  "expected at least one map clause for '#pragma omp %0'">;
+def err_omp_no_clause_for_directive : Error<
+  "expected at least one %0 clause for '#pragma omp %1'">;
 def note_omp_polymorphic_in_target : Note<
   "mappable type cannot be polymorphic">;
 def note_omp_static_member_in_target : Note<
diff --git a/include/clang/Frontend/ASTUnit.h b/include/clang/Frontend/ASTUnit.h
index 7f70609efc97..fb0a5e8acd4a 100644
--- a/include/clang/Frontend/ASTUnit.h
+++ b/include/clang/Frontend/ASTUnit.h
@@ -878,6 +878,11 @@ class ASTUnit : public ModuleLoader {
                ArrayRef<RemappedFile> RemappedFiles = None,
                IntrusiveRefCntPtr<vfs::FileSystem> VFS = nullptr);
 
+  /// \brief Free data that will be re-generated on the next parse.
+  ///
+  /// Preamble-related data is not affected.
+  void ResetForParse();
+
   /// \brief Perform code completion at the given file, line, and
   /// column within this translation unit.
   ///
diff --git a/include/clang/Lex/Preprocessor.h b/include/clang/Lex/Preprocessor.h
index 9f015eaa230b..aeca83a90716 100644
--- a/include/clang/Lex/Preprocessor.h
+++ b/include/clang/Lex/Preprocessor.h
@@ -283,6 +283,44 @@ class Preprocessor {
   /// This is used when loading a precompiled preamble.
   std::pair<int, bool> SkipMainFilePreamble;
 
+  class PreambleConditionalStackStore {
+    enum State {
+      Off = 0,
+      Recording = 1,
+      Replaying = 2,
+    };
+
+  public:
+    PreambleConditionalStackStore() : ConditionalStackState(Off) {}
+
+    void startRecording() { ConditionalStackState = Recording; }
+    void startReplaying() { ConditionalStackState = Replaying; }
+    bool isRecording() const { return ConditionalStackState == Recording; }
+    bool isReplaying() const { return ConditionalStackState == Replaying; }
+
+    ArrayRef<PPConditionalInfo> getStack() const {
+      return ConditionalStack;
+    }
+
+    void doneReplaying() {
+      ConditionalStack.clear();
+      ConditionalStackState = Off;
+    }
+
+    void setStack(ArrayRef<PPConditionalInfo> s) {
+      if (!isRecording() && !isReplaying())
+        return;
+      ConditionalStack.clear();
+      ConditionalStack.append(s.begin(), s.end());
+    }
+
+    bool hasRecordedPreamble() const { return !ConditionalStack.empty(); }
+
+  private:
+    SmallVector<PPConditionalInfo, 4> ConditionalStack;
+    State ConditionalStackState;
+  } PreambleConditionalStack;
+
   /// \brief The current top of the stack that we're lexing from if
   /// not expanding a macro and we are lexing directly from source code.
   ///
@@ -1695,6 +1733,11 @@ class Preprocessor {
   /// \brief Return true if we're in the top-level file, not in a \#include.
   bool isInPrimaryFile() const;
 
+  /// \brief Return true if we're in the main file (specifically, if we are 0
+  /// (zero) levels deep \#include. This is used by the lexer to determine if
+  /// it needs to generate errors about unterminated \#if directives.
+  bool isInMainFile() const;
+
   /// \brief Handle cases where the \#include name is expanded
   /// from a macro as multiple tokens, which need to be glued together. 
   ///
@@ -1932,6 +1975,27 @@ class Preprocessor {
                                                           Module *M,
                                                           SourceLocation MLoc);
 
+  bool isRecordingPreamble() const {
+    return PreambleConditionalStack.isRecording();
+  }
+
+  bool hasRecordedPreamble() const {
+    return PreambleConditionalStack.hasRecordedPreamble();
+  }
+
+  ArrayRef<PPConditionalInfo> getPreambleConditionalStack() const {
+      return PreambleConditionalStack.getStack();
+  }
+
+  void setRecordedPreambleConditionalStack(ArrayRef<PPConditionalInfo> s) {
+    PreambleConditionalStack.setStack(s);
+  }
+
+  void setReplayablePreambleConditionalStack(ArrayRef<PPConditionalInfo> s) {
+    PreambleConditionalStack.startReplaying();
+    PreambleConditionalStack.setStack(s);
+  }
+
 private:
   // Macro handling.
   void HandleDefineDirective(Token &Tok, bool ImmediatelyAfterTopLevelIfndef);
diff --git a/include/clang/Lex/PreprocessorLexer.h b/include/clang/Lex/PreprocessorLexer.h
index 6d6cf05a96c4..5c2e4d41454b 100644
--- a/include/clang/Lex/PreprocessorLexer.h
+++ b/include/clang/Lex/PreprocessorLexer.h
@@ -17,6 +17,7 @@
 
 #include "clang/Lex/MultipleIncludeOpt.h"
 #include "clang/Lex/Token.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 
 namespace clang {
@@ -176,6 +177,11 @@ class PreprocessorLexer {
   conditional_iterator conditional_end() const { 
     return ConditionalStack.end(); 
   }
+
+  void setConditionalLevels(ArrayRef<PPConditionalInfo> CL) {
+    ConditionalStack.clear();
+    ConditionalStack.append(CL.begin(), CL.end());
+  }
 };
 
 }  // end namespace clang
diff --git a/include/clang/Lex/PreprocessorOptions.h b/include/clang/Lex/PreprocessorOptions.h
index 58d79f7ff81a..c85d2384fa47 100644
--- a/include/clang/Lex/PreprocessorOptions.h
+++ b/include/clang/Lex/PreprocessorOptions.h
@@ -80,7 +80,14 @@ class PreprocessorOptions {
   /// The boolean indicates whether the preamble ends at the start of a new
   /// line.
   std::pair<unsigned, bool> PrecompiledPreambleBytes;
-  
+
+  /// \brief True indicates that a preamble is being generated.
+  ///
+  /// When the lexer is done, one of the things that need to be preserved is the
+  /// conditional #if stack, so the ASTWriter/ASTReader can save/restore it when
+  /// processing the rest of the file.
+  bool GeneratePreamble;
+
   /// The implicit PTH input included at the start of the translation unit, or
   /// empty.
   std::string ImplicitPTHInclude;
@@ -144,6 +151,7 @@ class PreprocessorOptions {
                           AllowPCHWithCompilerErrors(false),
                           DumpDeserializedPCHDecls(false),
                           PrecompiledPreambleBytes(0, true),
+                          GeneratePreamble(false),
                           RemappedFilesKeepOriginalName(true),
                           RetainRemappedFileBuffers(false),
                           ObjCXXARCStandardLibrary(ARCXX_nolib) { }
diff --git a/include/clang/Serialization/ASTBitCodes.h b/include/clang/Serialization/ASTBitCodes.h
index 823440b19713..6b40781a1239 100644
--- a/include/clang/Serialization/ASTBitCodes.h
+++ b/include/clang/Serialization/ASTBitCodes.h
@@ -607,6 +607,9 @@ namespace clang {
 
       /// \brief Record code for \#pragma pack options.
       PACK_PRAGMA_OPTIONS = 61,
+
+      /// \brief The stack of open #ifs/#ifdefs recorded in a preamble.
+      PP_CONDITIONAL_STACK = 62,
     };
 
     /// \brief Record types used within a source manager block.
diff --git a/include/clang/StaticAnalyzer/Checkers/Checkers.td b/include/clang/StaticAnalyzer/Checkers/Checkers.td
index 790ba5c121c9..4171c685cb98 100644
--- a/include/clang/StaticAnalyzer/Checkers/Checkers.td
+++ b/include/clang/StaticAnalyzer/Checkers/Checkers.td
@@ -279,15 +279,15 @@ def VirtualCallChecker : Checker<"VirtualCall">,
 
 let ParentPackage = CplusplusAlpha in {
 
+def IteratorRangeChecker : Checker<"IteratorRange">,
+  HelpText<"Check for iterators used outside their valid ranges">,
+  DescFile<"IteratorChecker.cpp">;
+
 def MisusedMovedObjectChecker: Checker<"MisusedMovedObject">,
      HelpText<"Method calls on a moved-from object and copying a moved-from "
               "object will be reported">,
      DescFile<"MisusedMovedObjectChecker.cpp">;
 
-def IteratorPastEndChecker : Checker<"IteratorPastEnd">,
-  HelpText<"Check iterators used past end">,
-  DescFile<"IteratorPastEndChecker.cpp">;
-
 } // end: "alpha.cplusplus"
 
 
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
index 2910ef4212cc..e3a2164b11ff 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h
@@ -43,6 +43,7 @@ typedef std::unique_ptr<ConstraintManager>(*ConstraintManagerCreator)(
     ProgramStateManager &, SubEngine *);
 typedef std::unique_ptr<StoreManager>(*StoreManagerCreator)(
     ProgramStateManager &);
+typedef llvm::ImmutableMap<const SubRegion*, TaintTagType> TaintedSubRegions;
 
 //===----------------------------------------------------------------------===//
 // ProgramStateTrait - Traits used by the Generic Data Map of a ProgramState.
@@ -343,6 +344,9 @@ class ProgramState : public llvm::FoldingSetNode {
   ProgramStateRef addTaint(const Stmt *S, const LocationContext *LCtx,
                                TaintTagType Kind = TaintTagGeneric) const;
 
+  /// Create a new state in which the value is marked as tainted.
+  ProgramStateRef addTaint(SVal V, TaintTagType Kind = TaintTagGeneric) const;
+
   /// Create a new state in which the symbol is marked as tainted.
   ProgramStateRef addTaint(SymbolRef S,
                                TaintTagType Kind = TaintTagGeneric) const;
@@ -351,6 +355,14 @@ class ProgramState : public llvm::FoldingSetNode {
   ProgramStateRef addTaint(const MemRegion *R,
                                TaintTagType Kind = TaintTagGeneric) const;
 
+  /// Create a new state in a which a sub-region of a given symbol is tainted.
+  /// This might be necessary when referring to regions that can not have an
+  /// individual symbol, e.g. if they are represented by the default binding of
+  /// a LazyCompoundVal.
+  ProgramStateRef addPartialTaint(SymbolRef ParentSym,
+                                  const SubRegion *SubRegion,
+                                  TaintTagType Kind = TaintTagGeneric) const;
+
   /// Check if the statement is tainted in the current state.
   bool isTainted(const Stmt *S, const LocationContext *LCtx,
                  TaintTagType Kind = TaintTagGeneric) const;
@@ -453,6 +465,7 @@ class ProgramStateManager {
   std::unique_ptr<ConstraintManager>   ConstraintMgr;
 
   ProgramState::GenericDataMap::Factory     GDMFactory;
+  TaintedSubRegions::Factory TSRFactory;
 
   typedef llvm::DenseMap<void*,std::pair<void*,void (*)(void*)> > GDMContextsTy;
   GDMContextsTy GDMContexts;
diff --git a/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h b/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h
index d39b5017d312..7b76263f040c 100644
--- a/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h
+++ b/include/clang/StaticAnalyzer/Core/PathSensitive/TaintManager.h
@@ -35,6 +35,16 @@ template<> struct ProgramStateTrait<TaintMap>
   static void *GDMIndex() { static int index = 0; return &index; }
 };
 
+/// The GDM component mapping derived symbols' parent symbols to their
+/// underlying regions. This is used to efficiently check whether a symbol is
+/// tainted when it represents a sub-region of a tainted symbol.
+struct DerivedSymTaint {};
+typedef llvm::ImmutableMap<SymbolRef, TaintedSubRegions> DerivedSymTaintImpl;
+template<> struct ProgramStateTrait<DerivedSymTaint>
+    :  public ProgramStatePartialTrait<DerivedSymTaintImpl> {
+  static void *GDMIndex() { static int index; return &index; }
+};
+
 class TaintManager {
 
   TaintManager() {}
diff --git a/lib/CodeGen/CGCoroutine.cpp b/lib/CodeGen/CGCoroutine.cpp
index c468c1bb4b5f..f65fb5b9232a 100644
--- a/lib/CodeGen/CGCoroutine.cpp
+++ b/lib/CodeGen/CGCoroutine.cpp
@@ -578,8 +578,7 @@ void CodeGenFunction::EmitCoroutineBody(const CoroutineBodyStmt &S) {
       EmitBlock(FinalBB);
       CurCoro.Data->CurrentAwaitKind = AwaitKind::Final;
       EmitStmt(S.getFinalSuspendStmt());
-    }
-    else {
+    } else {
       // We don't need FinalBB. Emit it to make sure the block is deleted.
       EmitBlock(FinalBB, /*IsFinished=*/true);
     }
diff --git a/lib/Frontend/ASTUnit.cpp b/lib/Frontend/ASTUnit.cpp
index d660638a1e8a..01f7ca8aba9b 100644
--- a/lib/Frontend/ASTUnit.cpp
+++ b/lib/Frontend/ASTUnit.cpp
@@ -1036,8 +1036,6 @@ static void checkAndSanitizeDiags(SmallVectorImpl<StoredDiagnostic> &
 bool ASTUnit::Parse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
                     std::unique_ptr<llvm::MemoryBuffer> OverrideMainBuffer,
                     IntrusiveRefCntPtr<vfs::FileSystem> VFS) {
-  SavedMainFileBuffer.reset();
-
   if (!Invocation)
     return true;
 
@@ -1090,17 +1088,11 @@ bool ASTUnit::Parse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
     Clang->createFileManager();
     FileMgr = &Clang->getFileManager();
   }
+
+  ResetForParse();
+
   SourceMgr = new SourceManager(getDiagnostics(), *FileMgr,
                                 UserFilesAreVolatile);
-  TheSema.reset();
-  Ctx = nullptr;
-  PP = nullptr;
-  Reader = nullptr;
-
-  // Clear out old caches and data.
-  TopLevelDecls.clear();
-  clearFileLevelDecls();
-
   if (!OverrideMainBuffer) {
     checkAndRemoveNonDriverDiags(StoredDiagnostics);
     TopLevelDeclsInPreamble.clear();
@@ -1999,6 +1991,7 @@ ASTUnit *ASTUnit::LoadFromCommandLine(
   PreprocessorOptions &PPOpts = CI->getPreprocessorOpts();
   PPOpts.RemappedFilesKeepOriginalName = RemappedFilesKeepOriginalName;
   PPOpts.AllowPCHWithCompilerErrors = AllowPCHWithCompilerErrors;
+  PPOpts.GeneratePreamble = PrecompilePreambleAfterNParses != 0;
   
   // Override the resources path.
   CI->getHeaderSearchOpts().ResourceDir = ResourceFilesPath;
@@ -2115,6 +2108,19 @@ bool ASTUnit::Reparse(std::shared_ptr<PCHContainerOperations> PCHContainerOps,
   return Result;
 }
 
+void ASTUnit::ResetForParse() {
+  SavedMainFileBuffer.reset();
+
+  SourceMgr.reset();
+  TheSema.reset();
+  Ctx.reset();
+  PP.reset();
+  Reader.reset();
+
+  TopLevelDecls.clear();
+  clearFileLevelDecls();
+}
+
 //----------------------------------------------------------------------------//
 // Code completion
 //----------------------------------------------------------------------------//
diff --git a/lib/Frontend/SerializedDiagnosticPrinter.cpp b/lib/Frontend/SerializedDiagnosticPrinter.cpp
index 7f88c919e24a..b5a5acd8ad9a 100644
--- a/lib/Frontend/SerializedDiagnosticPrinter.cpp
+++ b/lib/Frontend/SerializedDiagnosticPrinter.cpp
@@ -506,7 +506,7 @@ void SDiagsWriter::EmitBlockInfoBlock() {
   Abbrev->Add(BitCodeAbbrevOp(RECORD_FILENAME));
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 10)); // Mapped file ID.
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Size.
-  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Modifcation time.  
+  Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 32)); // Modification time.  
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Fixed, 16)); // Text size.
   Abbrev->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::Blob)); // File name text.
   Abbrevs.set(RECORD_FILENAME, Stream.EmitBlockInfoAbbrev(BLOCK_DIAG,
diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h
index 957fd5f65e26..90fd477d9b98 100644
--- a/lib/Headers/altivec.h
+++ b/lib/Headers/altivec.h
@@ -2887,87 +2887,79 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a,
 
 /* vec_ctf */
 
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
-
-static __inline__ vector float __ATTRS_o_ai vec_ctf(vector unsigned int __a,
-                                                    int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector unsigned long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
-
-static __inline__ vector double __ATTRS_o_ai
-vec_ctf(vector signed long long __a, int __b) {
-  vector double __ret = __builtin_convertvector(__a, vector double);
-  __ret *= (vector double)(vector unsigned long long)((0x3ffULL - __b) << 52);
-  return __ret;
-}
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)),  \
+             vector unsigned long long                                         \
+           : (__builtin_convertvector((vector unsigned long long)(__a),        \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)),              \
+             vector signed long long                                           \
+           : (__builtin_convertvector((vector signed long long)(__a),          \
+                                      vector double) *                         \
+              (vector double)(vector unsigned long long)((0x3ffULL - (__b))    \
+                                                         << 52)))
+#else
+#define vec_ctf(__a, __b)                                                      \
+  _Generic((__a), vector int                                                   \
+           : (vector float)__builtin_altivec_vcfsx((__a), (__b)),              \
+             vector unsigned int                                               \
+           : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)))
 #endif
 
 /* vec_vcfsx */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfsx(vector int __a, int __b) {
-  return __builtin_altivec_vcfsx(__a, __b);
-}
+#define vec_vcfux __builtin_altivec_vcfux
 
 /* vec_vcfux */
 
-static __inline__ vector float __attribute__((__always_inline__))
-vec_vcfux(vector unsigned int __a, int __b) {
-  return __builtin_altivec_vcfux((vector int)__a, __b);
-}
+#define vec_vcfsx(__a, __b) __builtin_altivec_vcfsx((vector int)(__a), (__b))
 
 /* vec_cts */
 
-static __inline__ vector int __ATTRS_o_ai vec_cts(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector signed long long __ATTRS_o_ai
-vec_cts(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector signed long long);
-}
+#define vec_cts(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctsxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector signed long long);          \
+           }))
+#else
+#define vec_cts __builtin_altivec_vctsxs
 #endif
 
 /* vec_vctsxs */
 
-static __inline__ vector int __attribute__((__always_inline__))
-vec_vctsxs(vector float __a, int __b) {
-  return __builtin_altivec_vctsxs(__a, __b);
-}
+#define vec_vctsxs __builtin_altivec_vctsxs
 
 /* vec_ctu */
 
-static __inline__ vector unsigned int __ATTRS_o_ai vec_ctu(vector float __a,
-                                                           int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
-
 #ifdef __VSX__
-static __inline__ vector unsigned long long __ATTRS_o_ai
-vec_ctu(vector double __a, int __b) {
-  __a *= (vector double)(vector unsigned long long)((0x3ffULL + __b) << 52);
-  return __builtin_convertvector(__a, vector unsigned long long);
-}
+#define vec_ctu(__a, __b)                                                      \
+  _Generic((__a), vector float                                                 \
+           : __builtin_altivec_vctuxs((__a), (__b)), vector double             \
+           : __extension__({                                                   \
+             vector double __ret =                                             \
+                 (__a) *                                                       \
+                 (vector double)(vector unsigned long long)((0x3ffULL + __b)   \
+                                                            << 52);            \
+             __builtin_convertvector(__ret, vector unsigned long long);        \
+           }))
+#else
+#define vec_ctu __builtin_altivec_vctuxs
 #endif
 
 /* vec_vctuxs */
 
-static __inline__ vector unsigned int __attribute__((__always_inline__))
-vec_vctuxs(vector float __a, int __b) {
-  return __builtin_altivec_vctuxs(__a, __b);
-}
+#define vec_vctuxs __builtin_altivec_vctuxs
 
 /* vec_signed */
 
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 92942fd09a0c..f5a35e97d6e1 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -550,8 +550,6 @@ namespace {
 
   enum PreambleDirectiveKind {
     PDK_Skipped,
-    PDK_StartIf,
-    PDK_EndIf,
     PDK_Unknown
   };
 
@@ -574,8 +572,6 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
 
   bool InPreprocessorDirective = false;
   Token TheTok;
-  Token IfStartTok;
-  unsigned IfCount = 0;
   SourceLocation ActiveCommentLoc;
 
   unsigned MaxLineOffset = 0;
@@ -658,33 +654,18 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
               .Case("sccs", PDK_Skipped)
               .Case("assert", PDK_Skipped)
               .Case("unassert", PDK_Skipped)
-              .Case("if", PDK_StartIf)
-              .Case("ifdef", PDK_StartIf)
-              .Case("ifndef", PDK_StartIf)
+              .Case("if", PDK_Skipped)
+              .Case("ifdef", PDK_Skipped)
+              .Case("ifndef", PDK_Skipped)
               .Case("elif", PDK_Skipped)
               .Case("else", PDK_Skipped)
-              .Case("endif", PDK_EndIf)
+              .Case("endif", PDK_Skipped)
               .Default(PDK_Unknown);
 
         switch (PDK) {
         case PDK_Skipped:
           continue;
 
-        case PDK_StartIf:
-          if (IfCount == 0)
-            IfStartTok = HashTok;
-            
-          ++IfCount;
-          continue;
-            
-        case PDK_EndIf:
-          // Mismatched #endif. The preamble ends here.
-          if (IfCount == 0)
-            break;
-
-          --IfCount;
-          continue;
-            
         case PDK_Unknown:
           // We don't know what this directive is; stop at the '#'.
           break;
@@ -705,16 +686,13 @@ std::pair<unsigned, bool> Lexer::ComputePreamble(StringRef Buffer,
   } while (true);
   
   SourceLocation End;
-  if (IfCount)
-    End = IfStartTok.getLocation();
-  else if (ActiveCommentLoc.isValid())
+  if (ActiveCommentLoc.isValid())
     End = ActiveCommentLoc; // don't truncate a decl comment.
   else
     End = TheTok.getLocation();
 
   return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
-                        IfCount? IfStartTok.isAtStartOfLine()
-                               : TheTok.isAtStartOfLine());
+                        TheTok.isAtStartOfLine());
 }
 
 /// AdvanceToTokenCharacter - Given a location that specifies the start of a
@@ -2570,6 +2548,11 @@ bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
     return true;
   }
   
+  if (PP->isRecordingPreamble() && !PP->isInMainFile()) {
+    PP->setRecordedPreambleConditionalStack(ConditionalStack);
+    ConditionalStack.clear();
+  }
+
   // Issue diagnostics for unterminated #if and missing newline.
 
   // If we are in a #if directive, emit an error.
diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index 030717b8bd5c..8b5877934f61 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/lib/Lex/PPDirectives.cpp
@@ -1906,6 +1906,25 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
     }
   }
 
+  // The #included file will be considered to be a system header if either it is
+  // in a system include directory, or if the #includer is a system include
+  // header.
+  SrcMgr::CharacteristicKind FileCharacter =
+      SourceMgr.getFileCharacteristic(FilenameTok.getLocation());
+  if (File)
+    FileCharacter = std::max(HeaderInfo.getFileDirFlavor(File), FileCharacter);
+
+  // Ask HeaderInfo if we should enter this #include file.  If not, #including
+  // this file will have no effect.
+  bool SkipHeader = false;
+  if (ShouldEnter && File &&
+      !HeaderInfo.ShouldEnterIncludeFile(*this, File, isImport,
+                                         getLangOpts().Modules,
+                                         SuggestedModule.getModule())) {
+    ShouldEnter = false;
+    SkipHeader = true;
+  }
+
   if (Callbacks) {
     // Notify the callback object that we've seen an inclusion directive.
     Callbacks->InclusionDirective(
@@ -1913,18 +1932,13 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
         LangOpts.MSVCCompat ? NormalizedPath.c_str() : Filename, isAngled,
         FilenameRange, File, SearchPath, RelativePath,
         ShouldEnter ? nullptr : SuggestedModule.getModule());
+    if (SkipHeader && !SuggestedModule.getModule())
+      Callbacks->FileSkipped(*File, FilenameTok, FileCharacter);
   }
 
   if (!File)
     return;
 
-  // The #included file will be considered to be a system header if either it is
-  // in a system include directory, or if the #includer is a system include
-  // header.
-  SrcMgr::CharacteristicKind FileCharacter =
-    std::max(HeaderInfo.getFileDirFlavor(File),
-             SourceMgr.getFileCharacteristic(FilenameTok.getLocation()));
-
   // FIXME: If we have a suggested module, and we've already visited this file,
   // don't bother entering it again. We know it has no further effect.
 
@@ -1964,19 +1978,6 @@ void Preprocessor::HandleIncludeDirective(SourceLocation HashLoc,
     }
   }
 
-  // Ask HeaderInfo if we should enter this #include file.  If not, #including
-  // this file will have no effect.
-  bool SkipHeader = false;
-  if (ShouldEnter &&
-      !HeaderInfo.ShouldEnterIncludeFile(*this, File, isImport,
-                                         getLangOpts().Modules,
-                                         SuggestedModule.getModule())) {
-    ShouldEnter = false;
-    SkipHeader = true;
-    if (Callbacks)
-      Callbacks->FileSkipped(*File, FilenameTok, FileCharacter);
-  }
-
   // If we don't need to enter the file, stop now.
   if (!ShouldEnter) {
     // If this is a module import, make it visible if needed.
diff --git a/lib/Lex/PPLexerChange.cpp b/lib/Lex/PPLexerChange.cpp
index 5a589d6a17b3..1c0cd5636835 100644
--- a/lib/Lex/PPLexerChange.cpp
+++ b/lib/Lex/PPLexerChange.cpp
@@ -46,6 +46,12 @@ bool Preprocessor::isInPrimaryFile() const {
   });
 }
 
+bool Preprocessor::isInMainFile() const {
+  if (IsFileLexer())
+    return IncludeMacroStack.size() == 0;
+  return true;
+}
+
 /// getCurrentLexer - Return the current file lexer being lexed from.  Note
 /// that this ignores any potentially active macro expansions and _Pragma
 /// expansions going on at the time.
diff --git a/lib/Lex/Pragma.cpp b/lib/Lex/Pragma.cpp
index 2d078a4e7603..e1d981527bec 100644
--- a/lib/Lex/Pragma.cpp
+++ b/lib/Lex/Pragma.cpp
@@ -1407,6 +1407,24 @@ struct PragmaModuleBeginHandler : public PragmaHandler {
       M = NewM;
     }
 
+    // If the module isn't available, it doesn't make sense to enter it.
+    if (!M->isAvailable()) {
+      Module::Requirement Requirement;
+      Module::UnresolvedHeaderDirective MissingHeader;
+      (void)M->isAvailable(PP.getLangOpts(), PP.getTargetInfo(),
+                           Requirement, MissingHeader);
+      if (MissingHeader.FileNameLoc.isValid()) {
+        PP.Diag(MissingHeader.FileNameLoc, diag::err_module_header_missing)
+          << MissingHeader.IsUmbrella << MissingHeader.FileName;
+      } else {
+        PP.Diag(M->DefinitionLoc, diag::err_module_unavailable)
+          << M->getFullModuleName() << Requirement.second << Requirement.first;
+      }
+      PP.Diag(BeginLoc, diag::note_pp_module_begin_here)
+        << M->getTopLevelModuleName();
+      return;
+    }
+
     // Enter the scope of the submodule.
     PP.EnterSubmodule(M, BeginLoc, /*ForPragma*/true);
     PP.EnterAnnotationToken(SourceRange(BeginLoc, ModuleName.back().second),
diff --git a/lib/Lex/Preprocessor.cpp b/lib/Lex/Preprocessor.cpp
index dce8c1efda23..3596337c245e 100644
--- a/lib/Lex/Preprocessor.cpp
+++ b/lib/Lex/Preprocessor.cpp
@@ -150,6 +150,9 @@ Preprocessor::Preprocessor(std::shared_ptr<PreprocessorOptions> PPOpts,
     Ident_GetExceptionInfo = Ident_GetExceptionCode = nullptr;
     Ident_AbnormalTermination = nullptr;
   }
+
+  if (this->PPOpts->GeneratePreamble)
+    PreambleConditionalStack.startRecording();
 }
 
 Preprocessor::~Preprocessor() {
@@ -532,6 +535,12 @@ void Preprocessor::EnterMainSourceFile() {
 
   // Start parsing the predefines.
   EnterSourceFile(FID, nullptr, SourceLocation());
+
+  // Restore the conditional stack from the preamble, if there is one.
+  if (PreambleConditionalStack.isReplaying()) {
+    CurPPLexer->setConditionalLevels(PreambleConditionalStack.getStack());
+    PreambleConditionalStack.doneReplaying();
+  }
 }
 
 void Preprocessor::EndSourceFile() {
diff --git a/lib/Sema/SemaDecl.cpp b/lib/Sema/SemaDecl.cpp
index 96fd952c81c7..a9adbec4f842 100644
--- a/lib/Sema/SemaDecl.cpp
+++ b/lib/Sema/SemaDecl.cpp
@@ -12509,6 +12509,9 @@ NamedDecl *Sema::ImplicitlyDefineFunction(SourceLocation Loc,
   unsigned diag_id;
   if (II.getName().startswith("__builtin_"))
     diag_id = diag::warn_builtin_unknown;
+  // OpenCL v2.0 s6.9.u - Implicit function declaration is not supported.
+  else if (getLangOpts().OpenCL)
+    diag_id = diag::err_opencl_implicit_function_decl;
   else if (getLangOpts().C99)
     diag_id = diag::ext_implicit_function_decl;
   else
diff --git a/lib/Sema/SemaExpr.cpp b/lib/Sema/SemaExpr.cpp
index 759c82eeaa07..b1a07ffb7206 100644
--- a/lib/Sema/SemaExpr.cpp
+++ b/lib/Sema/SemaExpr.cpp
@@ -8074,28 +8074,38 @@ QualType Sema::InvalidLogicalVectorOperands(SourceLocation Loc, ExprResult &LHS,
 /// rank; for C, Obj-C, and C++ we allow any real scalar conversion except
 /// for float->int.
 ///
+/// OpenCL V2.0 6.2.6.p2:
+/// An error shall occur if any scalar operand type has greater rank
+/// than the type of the vector element.
+///
 /// \param scalar - if non-null, actually perform the conversions
 /// \return true if the operation fails (but without diagnosing the failure)
 static bool tryVectorConvertAndSplat(Sema &S, ExprResult *scalar,
                                      QualType scalarTy,
                                      QualType vectorEltTy,
-                                     QualType vectorTy) {
+                                     QualType vectorTy,
+                                     unsigned &DiagID) {
   // The conversion to apply to the scalar before splatting it,
   // if necessary.
   CastKind scalarCast = CK_Invalid;
   
   if (vectorEltTy->isIntegralType(S.Context)) {
-    if (!scalarTy->isIntegralType(S.Context))
+    if (S.getLangOpts().OpenCL && (scalarTy->isRealFloatingType() ||
+        (scalarTy->isIntegerType() &&
+         S.Context.getIntegerTypeOrder(vectorEltTy, scalarTy) < 0))) {
+      DiagID = diag::err_opencl_scalar_type_rank_greater_than_vector_type;
       return true;
-    if (S.getLangOpts().OpenCL &&
-        S.Context.getIntegerTypeOrder(vectorEltTy, scalarTy) < 0)
+    }
+    if (!scalarTy->isIntegralType(S.Context))
       return true;
     scalarCast = CK_IntegralCast;
   } else if (vectorEltTy->isRealFloatingType()) {
     if (scalarTy->isRealFloatingType()) {
       if (S.getLangOpts().OpenCL &&
-          S.Context.getFloatingTypeOrder(vectorEltTy, scalarTy) < 0)
+          S.Context.getFloatingTypeOrder(vectorEltTy, scalarTy) < 0) {
+        DiagID = diag::err_opencl_scalar_type_rank_greater_than_vector_type;
         return true;
+      }
       scalarCast = CK_FloatingCast;
     }
     else if (scalarTy->isIntegralType(S.Context))
@@ -8341,10 +8351,12 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
 
   // If there's a vector type and a scalar, try to convert the scalar to
   // the vector element type and splat.
+  unsigned DiagID = diag::err_typecheck_vector_not_convertable;
   if (!RHSVecType) {
     if (isa<ExtVectorType>(LHSVecType)) {
       if (!tryVectorConvertAndSplat(*this, &RHS, RHSType,
-                                    LHSVecType->getElementType(), LHSType))
+                                    LHSVecType->getElementType(), LHSType,
+                                    DiagID))
         return LHSType;
     } else {
       if (!tryGCCVectorConvertAndSplat(*this, &RHS, &LHS))
@@ -8355,7 +8367,7 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
     if (isa<ExtVectorType>(RHSVecType)) {
       if (!tryVectorConvertAndSplat(*this, (IsCompAssign ? nullptr : &LHS),
                                     LHSType, RHSVecType->getElementType(),
-                                    RHSType))
+                                    RHSType, DiagID))
         return RHSType;
     } else {
       if (LHS.get()->getValueKind() == VK_LValue ||
@@ -8431,7 +8443,7 @@ QualType Sema::CheckVectorOperands(ExprResult &LHS, ExprResult &RHS,
   }
 
   // Otherwise, use the generic diagnostic.
-  Diag(Loc, diag::err_typecheck_vector_not_convertable)
+  Diag(Loc, DiagID)
     << LHSType << RHSType
     << LHS.get()->getSourceRange() << RHS.get()->getSourceRange();
   return QualType();
diff --git a/lib/Sema/SemaOpenMP.cpp b/lib/Sema/SemaOpenMP.cpp
index 43fd055bbc56..2b7733d2adbd 100644
--- a/lib/Sema/SemaOpenMP.cpp
+++ b/lib/Sema/SemaOpenMP.cpp
@@ -5929,16 +5929,17 @@ StmtResult Sema::ActOnOpenMPTargetParallelForDirective(
                                                B, DSAStack->isCancelRegion());
 }
 
-/// \brief Check for existence of a map clause in the list of clauses.
-static bool HasMapClause(ArrayRef<OMPClause *> Clauses) {
-  for (ArrayRef<OMPClause *>::iterator I = Clauses.begin(), E = Clauses.end();
-       I != E; ++I) {
-    if (*I != nullptr && (*I)->getClauseKind() == OMPC_map) {
-      return true;
-    }
-  }
+/// Check for existence of a map clause in the list of clauses.
+static bool hasClauses(ArrayRef<OMPClause *> Clauses,
+                       const OpenMPClauseKind K) {
+  return llvm::any_of(
+      Clauses, [K](const OMPClause *C) { return C->getClauseKind() == K; });
+}
 
-  return false;
+template <typename... Params>
+static bool hasClauses(ArrayRef<OMPClause *> Clauses, const OpenMPClauseKind K,
+                       const Params... ClauseTypes) {
+  return hasClauses(Clauses, K) || hasClauses(Clauses, ClauseTypes...);
 }
 
 StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
@@ -5952,8 +5953,9 @@ StmtResult Sema::ActOnOpenMPTargetDataDirective(ArrayRef<OMPClause *> Clauses,
 
   // OpenMP [2.10.1, Restrictions, p. 97]
   // At least one map clause must appear on the directive.
-  if (!HasMapClause(Clauses)) {
-    Diag(StartLoc, diag::err_omp_no_map_for_directive)
+  if (!hasClauses(Clauses, OMPC_map, OMPC_use_device_ptr)) {
+    Diag(StartLoc, diag::err_omp_no_clause_for_directive)
+        << "'map' or 'use_device_ptr'"
         << getOpenMPDirectiveName(OMPD_target_data);
     return StmtError();
   }
@@ -5970,9 +5972,9 @@ Sema::ActOnOpenMPTargetEnterDataDirective(ArrayRef<OMPClause *> Clauses,
                                           SourceLocation EndLoc) {
   // OpenMP [2.10.2, Restrictions, p. 99]
   // At least one map clause must appear on the directive.
-  if (!HasMapClause(Clauses)) {
-    Diag(StartLoc, diag::err_omp_no_map_for_directive)
-        << getOpenMPDirectiveName(OMPD_target_enter_data);
+  if (!hasClauses(Clauses, OMPC_map)) {
+    Diag(StartLoc, diag::err_omp_no_clause_for_directive)
+        << "'map'" << getOpenMPDirectiveName(OMPD_target_enter_data);
     return StmtError();
   }
 
@@ -5986,9 +5988,9 @@ Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
                                          SourceLocation EndLoc) {
   // OpenMP [2.10.3, Restrictions, p. 102]
   // At least one map clause must appear on the directive.
-  if (!HasMapClause(Clauses)) {
-    Diag(StartLoc, diag::err_omp_no_map_for_directive)
-        << getOpenMPDirectiveName(OMPD_target_exit_data);
+  if (!hasClauses(Clauses, OMPC_map)) {
+    Diag(StartLoc, diag::err_omp_no_clause_for_directive)
+        << "'map'" << getOpenMPDirectiveName(OMPD_target_exit_data);
     return StmtError();
   }
 
@@ -5998,12 +6000,7 @@ Sema::ActOnOpenMPTargetExitDataDirective(ArrayRef<OMPClause *> Clauses,
 StmtResult Sema::ActOnOpenMPTargetUpdateDirective(ArrayRef<OMPClause *> Clauses,
                                                   SourceLocation StartLoc,
                                                   SourceLocation EndLoc) {
-  bool seenMotionClause = false;
-  for (auto *C : Clauses) {
-    if (C->getClauseKind() == OMPC_to || C->getClauseKind() == OMPC_from)
-      seenMotionClause = true;
-  }
-  if (!seenMotionClause) {
+  if (!hasClauses(Clauses, OMPC_to, OMPC_from)) {
     Diag(StartLoc, diag::err_omp_at_least_one_motion_clause_required);
     return StmtError();
   }
diff --git a/lib/Serialization/ASTReader.cpp b/lib/Serialization/ASTReader.cpp
index 55cb670f427e..b7bbb9dc7be1 100644
--- a/lib/Serialization/ASTReader.cpp
+++ b/lib/Serialization/ASTReader.cpp
@@ -2925,6 +2925,21 @@ ASTReader::ReadASTBlock(ModuleFile &F, unsigned ClientLoadCapabilities) {
       }
       break;
 
+    case PP_CONDITIONAL_STACK:
+      if (!Record.empty()) {
+        SmallVector<PPConditionalInfo, 4> ConditionalStack;
+        for (unsigned Idx = 0, N = Record.size() - 1; Idx < N; /* in loop */) {
+          auto Loc = ReadSourceLocation(F, Record, Idx);
+          bool WasSkipping = Record[Idx++];
+          bool FoundNonSkip = Record[Idx++];
+          bool FoundElse = Record[Idx++];
+          ConditionalStack.push_back(
+              {Loc, WasSkipping, FoundNonSkip, FoundElse});
+        }
+        PP.setReplayablePreambleConditionalStack(ConditionalStack);
+      }
+      break;
+
     case PP_COUNTER_VALUE:
       if (!Record.empty() && Listener)
         Listener->ReadCounter(F, Record[0]);
diff --git a/lib/Serialization/ASTWriter.cpp b/lib/Serialization/ASTWriter.cpp
index b3556371c9b8..c931b13f65f3 100644
--- a/lib/Serialization/ASTWriter.cpp
+++ b/lib/Serialization/ASTWriter.cpp
@@ -1093,6 +1093,7 @@ void ASTWriter::WriteBlockInfoBlock() {
   RECORD(UNUSED_LOCAL_TYPEDEF_NAME_CANDIDATES);
   RECORD(DELETE_EXPRS_TO_ANALYZE);
   RECORD(CUDA_PRAGMA_FORCE_HOST_DEVICE_DEPTH);
+  RECORD(PP_CONDITIONAL_STACK);
 
   // SourceManager Block.
   BLOCK(SOURCE_MANAGER_BLOCK);
@@ -2302,6 +2303,18 @@ void ASTWriter::WritePreprocessor(const Preprocessor &PP, bool IsModule) {
     Stream.EmitRecord(PP_COUNTER_VALUE, Record);
   }
 
+  if (PP.isRecordingPreamble() && PP.hasRecordedPreamble()) {
+    assert(!IsModule);
+    for (const auto &Cond : PP.getPreambleConditionalStack()) {
+      AddSourceLocation(Cond.IfLoc, Record);
+      Record.push_back(Cond.WasSkipping);
+      Record.push_back(Cond.FoundNonSkip);
+      Record.push_back(Cond.FoundElse);
+    }
+    Stream.EmitRecord(PP_CONDITIONAL_STACK, Record);
+    Record.clear();
+  }
+
   // Enter the preprocessor block.
   Stream.EnterSubblock(PREPROCESSOR_BLOCK_ID, 3);
 
diff --git a/lib/StaticAnalyzer/Checkers/CMakeLists.txt b/lib/StaticAnalyzer/Checkers/CMakeLists.txt
index 60d60bc074eb..2759240dd276 100644
--- a/lib/StaticAnalyzer/Checkers/CMakeLists.txt
+++ b/lib/StaticAnalyzer/Checkers/CMakeLists.txt
@@ -39,7 +39,7 @@ add_clang_library(clangStaticAnalyzerCheckers
   GenericTaintChecker.cpp
   GTestChecker.cpp
   IdenticalExprChecker.cpp
-  IteratorPastEndChecker.cpp
+  IteratorChecker.cpp
   IvarInvalidationChecker.cpp
   LLVMConventionsChecker.cpp
   LocalizationChecker.cpp
diff --git a/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp b/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
index b1a54e77951b..883c6a663291 100644
--- a/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/GenericTaintChecker.cpp
@@ -65,21 +65,8 @@ class GenericTaintChecker : public Checker< check::PostStmt<CallExpr>,
   /// and thus, is tainted.
   static bool isStdin(const Expr *E, CheckerContext &C);
 
-  /// This is called from getPointedToSymbol() to resolve symbol references for
-  /// the region underlying a LazyCompoundVal. This is the default binding
-  /// for the LCV, which could be a conjured symbol from a function call that
-  /// initialized the region. It only returns the conjured symbol if the LCV
-  /// covers the entire region, e.g. we avoid false positives by not returning
-  /// a default bindingc for an entire struct if the symbol for only a single
-  /// field or element within it is requested.
-  // TODO: Return an appropriate symbol for sub-fields/elements of an LCV so
-  // that they are also appropriately tainted.
-  static SymbolRef getLCVSymbol(CheckerContext &C,
-                                nonloc::LazyCompoundVal &LCV);
-
-  /// \brief Given a pointer argument, get the symbol of the value it contains
-  /// (points to).
-  static SymbolRef getPointedToSymbol(CheckerContext &C, const Expr *Arg);
+  /// \brief Given a pointer argument, return the value it points to.
+  static Optional<SVal> getPointedToSVal(CheckerContext &C, const Expr *Arg);
 
   /// Functions defining the attack surface.
   typedef ProgramStateRef (GenericTaintChecker::*FnCheck)(const CallExpr *,
@@ -186,9 +173,14 @@ class GenericTaintChecker : public Checker< check::PostStmt<CallExpr>,
     static inline bool isTaintedOrPointsToTainted(const Expr *E,
                                                   ProgramStateRef State,
                                                   CheckerContext &C) {
-      return (State->isTainted(E, C.getLocationContext()) || isStdin(E, C) ||
-              (E->getType().getTypePtr()->isPointerType() &&
-               State->isTainted(getPointedToSymbol(C, E))));
+      if (State->isTainted(E, C.getLocationContext()) || isStdin(E, C))
+        return true;
+
+      if (!E->getType().getTypePtr()->isPointerType())
+        return false;
+
+      Optional<SVal> V = getPointedToSVal(C, E);
+      return (V && State->isTainted(*V));
     }
 
     /// \brief Pre-process a function which propagates taint according to the
@@ -400,9 +392,9 @@ bool GenericTaintChecker::propagateFromPre(const CallExpr *CE,
     if (CE->getNumArgs() < (ArgNum + 1))
       return false;
     const Expr* Arg = CE->getArg(ArgNum);
-    SymbolRef Sym = getPointedToSymbol(C, Arg);
-    if (Sym)
-      State = State->addTaint(Sym);
+    Optional<SVal> V = getPointedToSVal(C, Arg);
+    if (V)
+      State = State->addTaint(*V);
   }
 
   // Clear up the taint info from the state.
@@ -473,47 +465,20 @@ bool GenericTaintChecker::checkPre(const CallExpr *CE, CheckerContext &C) const{
   return false;
 }
 
-SymbolRef GenericTaintChecker::getLCVSymbol(CheckerContext &C,
-                                            nonloc::LazyCompoundVal &LCV) {
-  StoreManager &StoreMgr = C.getStoreManager();
-
-  // getLCVSymbol() is reached in a PostStmt so we can always expect a default
-  // binding to exist if one is present.
-  if (Optional<SVal> binding = StoreMgr.getDefaultBinding(LCV)) {
-    SymbolRef Sym = binding->getAsSymbol();
-    if (!Sym)
-      return nullptr;
-
-    // If the LCV covers an entire base region return the default conjured symbol.
-    if (LCV.getRegion() == LCV.getRegion()->getBaseRegion())
-      return Sym;
-  }
-
-  // Otherwise, return a nullptr as there's not yet a functional way to taint
-  // sub-regions of LCVs.
-  return nullptr;
-}
-
-SymbolRef GenericTaintChecker::getPointedToSymbol(CheckerContext &C,
-                                                  const Expr* Arg) {
+Optional<SVal> GenericTaintChecker::getPointedToSVal(CheckerContext &C,
+                                            const Expr* Arg) {
   ProgramStateRef State = C.getState();
   SVal AddrVal = State->getSVal(Arg->IgnoreParens(), C.getLocationContext());
   if (AddrVal.isUnknownOrUndef())
-    return nullptr;
+    return None;
 
   Optional<Loc> AddrLoc = AddrVal.getAs<Loc>();
   if (!AddrLoc)
-    return nullptr;
+    return None;
 
   const PointerType *ArgTy =
     dyn_cast<PointerType>(Arg->getType().getCanonicalType().getTypePtr());
-  SVal Val = State->getSVal(*AddrLoc,
-                            ArgTy ? ArgTy->getPointeeType(): QualType());
-
-  if (auto LCV = Val.getAs<nonloc::LazyCompoundVal>())
-    return getLCVSymbol(C, *LCV);
-
-  return Val.getAsSymbol();
+  return State->getSVal(*AddrLoc, ArgTy ? ArgTy->getPointeeType(): QualType());
 }
 
 ProgramStateRef
@@ -633,9 +598,9 @@ ProgramStateRef GenericTaintChecker::postScanf(const CallExpr *CE,
     // The arguments are pointer arguments. The data they are pointing at is
     // tainted after the call.
     const Expr* Arg = CE->getArg(i);
-        SymbolRef Sym = getPointedToSymbol(C, Arg);
-    if (Sym)
-      State = State->addTaint(Sym);
+    Optional<SVal> V = getPointedToSVal(C, Arg);
+    if (V)
+      State = State->addTaint(*V);
   }
   return State;
 }
@@ -710,10 +675,10 @@ bool GenericTaintChecker::generateReportIfTainted(const Expr *E,
 
   // Check for taint.
   ProgramStateRef State = C.getState();
-  const SymbolRef PointedToSym = getPointedToSymbol(C, E);
+  Optional<SVal> PointedToSVal = getPointedToSVal(C, E);
   SVal TaintedSVal;
-  if (State->isTainted(PointedToSym))
-    TaintedSVal = nonloc::SymbolVal(PointedToSym);
+  if (PointedToSVal && State->isTainted(*PointedToSVal))
+    TaintedSVal = *PointedToSVal;
   else if (State->isTainted(E, C.getLocationContext()))
     TaintedSVal = C.getSVal(E);
   else
diff --git a/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp b/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
new file mode 100644
index 000000000000..0f9b749506fa
--- /dev/null
+++ b/lib/StaticAnalyzer/Checkers/IteratorChecker.cpp
@@ -0,0 +1,833 @@
+//===-- IteratorChecker.cpp ---------------------------------------*- C++ -*--//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a checker for using iterators outside their range (past end). Usage
+// means here dereferencing, incrementing etc.
+//
+//===----------------------------------------------------------------------===//
+//
+// In the code, iterator can be represented as a:
+// * type-I: typedef-ed pointer. Operations over such iterator, such as
+//           comparisons or increments, are modeled straightforwardly by the
+//           analyzer.
+// * type-II: structure with its method bodies available.  Operations over such
+//            iterator are inlined by the analyzer, and results of modeling
+//            these operations are exposing implementation details of the
+//            iterators, which is not necessarily helping.
+// * type-III: completely opaque structure. Operations over such iterator are
+//             modeled conservatively, producing conjured symbols everywhere.
+//
+// To handle all these types in a common way we introduce a structure called
+// IteratorPosition which is an abstraction of the position the iterator
+// represents using symbolic expressions. The checker handles all the
+// operations on this structure.
+//
+// Additionally, depending on the circumstances, operators of types II and III
+// can be represented as:
+// * type-IIa, type-IIIa: conjured structure symbols - when returned by value
+//                        from conservatively evaluated methods such as
+//                        `.begin()`.
+// * type-IIb, type-IIIb: memory regions of iterator-typed objects, such as
+//                        variables or temporaries, when the iterator object is
+//                        currently treated as an lvalue.
+// * type-IIc, type-IIIc: compound values of iterator-typed objects, when the
+//                        iterator object is treated as an rvalue taken of a
+//                        particular lvalue, eg. a copy of "type-a" iterator
+//                        object, or an iterator that existed before the
+//                        analysis has started.
+//
+// To handle any of these three different representations stored in an SVal we
+// use setter and getters functions which separate the three cases. To store
+// them we use a pointer union of symbol and memory region.
+//
+// The checker works the following way: We record the past-end iterator for
+// all containers whenever their `.end()` is called. Since the Constraint
+// Manager cannot handle SVals we need to take over its role. We post-check
+// equality and non-equality comparisons and propagate the position of the
+// iterator to the other side of the comparison if it is past-end and we are in
+// the 'equal' branch (true-branch for `==` and false-branch for `!=`).
+//
+// In case of type-I or type-II iterators we get a concrete integer as a result
+// of the comparison (1 or 0) but in case of type-III we only get a Symbol. In
+// this latter case we record the symbol and reload it in evalAssume() and do
+// the propagation there. We also handle (maybe double) negated comparisons
+// which are represented in the form of (x == 0 or x !=0 ) where x is the
+// comparison itself.
+
+#include "ClangSACheckers.h"
+#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
+#include "clang/StaticAnalyzer/Core/Checker.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+
+using namespace clang;
+using namespace ento;
+
+namespace {
+
+// Abstract position of an iterator. This helps to handle all three kinds
+// of operators in a common way by using a symbolic position.
+struct IteratorPosition {
+private:
+
+  // Container the iterator belongs to
+  const MemRegion *Cont;
+
+  // Abstract offset
+  SymbolRef Offset;
+
+  IteratorPosition(const MemRegion *C, SymbolRef Of)
+      : Cont(C), Offset(Of) {}
+
+public:
+  const MemRegion *getContainer() const { return Cont; }
+  SymbolRef getOffset() const { return Offset; }
+
+  static IteratorPosition getPosition(const MemRegion *C, SymbolRef Of) {
+    return IteratorPosition(C, Of);
+  }
+
+  IteratorPosition setTo(SymbolRef NewOf) const {
+    return IteratorPosition(Cont, NewOf);
+  }
+
+  bool operator==(const IteratorPosition &X) const {
+    return Cont == X.Cont && Offset == X.Offset;
+  }
+
+  bool operator!=(const IteratorPosition &X) const {
+    return Cont != X.Cont || Offset != X.Offset;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.AddPointer(Cont);
+    ID.Add(Offset);
+  }
+};
+
+typedef llvm::PointerUnion<const MemRegion *, SymbolRef> RegionOrSymbol;
+
+// Structure to record the symbolic end position of a container
+struct ContainerData {
+private:
+  SymbolRef End;
+
+  ContainerData(SymbolRef E) : End(E) {}
+
+public:
+  static ContainerData fromEnd(SymbolRef E) {
+    return ContainerData(E);
+  }
+
+  SymbolRef getEnd() const { return End; }
+
+  ContainerData newEnd(SymbolRef E) const { return ContainerData(E); }
+
+  bool operator==(const ContainerData &X) const {
+    return End == X.End;
+  }
+
+  bool operator!=(const ContainerData &X) const {
+    return End != X.End;
+  }
+
+  void Profile(llvm::FoldingSetNodeID &ID) const {
+    ID.Add(End);
+  }
+};
+
+// Structure fo recording iterator comparisons. We needed to retrieve the
+// original comparison expression in assumptions.
+struct IteratorComparison {
+private:
+  RegionOrSymbol Left, Right;
+  bool Equality;
+
+public:
+  IteratorComparison(RegionOrSymbol L, RegionOrSymbol R, bool Eq)
+      : Left(L), Right(R), Equality(Eq) {}
+
+  RegionOrSymbol getLeft() const { return Left; }
+  RegionOrSymbol getRight() const { return Right; }
+  bool isEquality() const { return Equality; }
+  bool operator==(const IteratorComparison &X) const {
+    return Left == X.Left && Right == X.Right && Equality == X.Equality;
+  }
+  bool operator!=(const IteratorComparison &X) const {
+    return Left != X.Left || Right != X.Right || Equality != X.Equality;
+  }
+  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Equality); }
+};
+
+class IteratorChecker
+    : public Checker<check::PreCall, check::PostCall,
+                     check::PostStmt<MaterializeTemporaryExpr>,
+                     check::DeadSymbols,
+                     eval::Assume> {
+
+  std::unique_ptr<BugType> OutOfRangeBugType;
+
+  void handleComparison(CheckerContext &C, const SVal &RetVal, const SVal &LVal,
+                        const SVal &RVal, OverloadedOperatorKind Op) const;
+  void verifyDereference(CheckerContext &C, const SVal &Val) const;
+  void handleEnd(CheckerContext &C, const Expr *CE, const SVal &RetVal,
+                 const SVal &Cont) const;
+  void assignToContainer(CheckerContext &C, const Expr *CE, const SVal &RetVal,
+                         const MemRegion *Cont) const;
+  void reportOutOfRangeBug(const StringRef &Message, const SVal &Val,
+                           CheckerContext &C, ExplodedNode *ErrNode) const;
+
+public:
+  IteratorChecker();
+
+  enum CheckKind {
+    CK_IteratorRangeChecker,
+    CK_NumCheckKinds
+  };
+
+  DefaultBool ChecksEnabled[CK_NumCheckKinds];
+  CheckName CheckNames[CK_NumCheckKinds];
+
+  void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
+  void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
+  void checkPostStmt(const MaterializeTemporaryExpr *MTE,
+                     CheckerContext &C) const;
+  void checkDeadSymbols(SymbolReaper &SR, CheckerContext &C) const;
+  ProgramStateRef evalAssume(ProgramStateRef State, SVal Cond,
+                             bool Assumption) const;
+};
+} // namespace
+
+REGISTER_MAP_WITH_PROGRAMSTATE(IteratorSymbolMap, SymbolRef, IteratorPosition)
+REGISTER_MAP_WITH_PROGRAMSTATE(IteratorRegionMap, const MemRegion *,
+                               IteratorPosition)
+
+REGISTER_MAP_WITH_PROGRAMSTATE(ContainerMap, const MemRegion *, ContainerData)
+
+REGISTER_MAP_WITH_PROGRAMSTATE(IteratorComparisonMap, const SymExpr *,
+                               IteratorComparison)
+
+namespace {
+
+bool isIteratorType(const QualType &Type);
+bool isIterator(const CXXRecordDecl *CRD);
+bool isEndCall(const FunctionDecl *Func);
+bool isSimpleComparisonOperator(OverloadedOperatorKind OK);
+bool isDereferenceOperator(OverloadedOperatorKind OK);
+BinaryOperator::Opcode getOpcode(const SymExpr *SE);
+const RegionOrSymbol getRegionOrSymbol(const SVal &Val);
+const ProgramStateRef processComparison(ProgramStateRef State,
+                                        RegionOrSymbol LVal,
+                                        RegionOrSymbol RVal, bool Equal);
+const ProgramStateRef saveComparison(ProgramStateRef State,
+                                     const SymExpr *Condition, const SVal &LVal,
+                                     const SVal &RVal, bool Eq);
+const IteratorComparison *loadComparison(ProgramStateRef State,
+                                         const SymExpr *Condition);
+SymbolRef getContainerEnd(ProgramStateRef State, const MemRegion *Cont);
+ProgramStateRef createContainerEnd(ProgramStateRef State, const MemRegion *Cont,
+                                   const SymbolRef Sym);
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            const SVal &Val);
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            RegionOrSymbol RegOrSym);
+ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
+                                    const IteratorPosition &Pos);
+ProgramStateRef setIteratorPosition(ProgramStateRef State,
+                                    RegionOrSymbol RegOrSym,
+                                    const IteratorPosition &Pos);
+ProgramStateRef removeIteratorPosition(ProgramStateRef State, const SVal &Val);
+ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
+                                       RegionOrSymbol RegOrSym,
+                                       const IteratorPosition &Pos, bool Equal);
+ProgramStateRef relateIteratorPositions(ProgramStateRef State,
+                                        const IteratorPosition &Pos1,
+                                        const IteratorPosition &Pos2,
+                                        bool Equal);
+const ContainerData *getContainerData(ProgramStateRef State,
+                                      const MemRegion *Cont);
+ProgramStateRef setContainerData(ProgramStateRef State, const MemRegion *Cont,
+                                 const ContainerData &CData);
+bool isOutOfRange(ProgramStateRef State, const IteratorPosition &Pos);
+} // namespace
+
+IteratorChecker::IteratorChecker() {
+  OutOfRangeBugType.reset(
+      new BugType(this, "Iterator out of range", "Misuse of STL APIs"));
+  OutOfRangeBugType->setSuppressOnSink(true);
+}
+
+void IteratorChecker::checkPreCall(const CallEvent &Call,
+                                   CheckerContext &C) const {
+  // Check for out of range access
+  const auto *Func = dyn_cast_or_null<FunctionDecl>(Call.getDecl());
+  if (!Func)
+    return;
+
+  if (Func->isOverloadedOperator()) {
+    if (ChecksEnabled[CK_IteratorRangeChecker] &&
+               isDereferenceOperator(Func->getOverloadedOperator())) {
+      // Check for dereference of out-of-range iterators
+      if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
+        verifyDereference(C, InstCall->getCXXThisVal());
+      } else {
+        verifyDereference(C, Call.getArgSVal(0));
+      }
+    }
+  }
+}
+
+void IteratorChecker::checkPostCall(const CallEvent &Call,
+                                    CheckerContext &C) const {
+  // Record new iterator positions and iterator position changes
+  const auto *Func = dyn_cast_or_null<FunctionDecl>(Call.getDecl());
+  if (!Func)
+    return;
+
+  if (Func->isOverloadedOperator()) {
+    const auto Op = Func->getOverloadedOperator();
+    if (isSimpleComparisonOperator(Op)) {
+      if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
+        handleComparison(C, Call.getReturnValue(), InstCall->getCXXThisVal(),
+                         Call.getArgSVal(0), Op);
+      } else {
+        handleComparison(C, Call.getReturnValue(), Call.getArgSVal(0),
+                         Call.getArgSVal(1), Op);
+      }
+    }
+  } else {
+    const auto *OrigExpr = Call.getOriginExpr();
+    if (!OrigExpr)
+      return;
+
+    if (!isIteratorType(Call.getResultType()))
+      return;
+
+    auto State = C.getState();
+    // Already bound to container?
+    if (getIteratorPosition(State, Call.getReturnValue()))
+      return;
+
+    if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
+      if (isEndCall(Func)) {
+        handleEnd(C, OrigExpr, Call.getReturnValue(),
+                  InstCall->getCXXThisVal());
+        return;
+      }
+    }
+
+    // Copy-like and move constructors
+    if (isa<CXXConstructorCall>(&Call) && Call.getNumArgs() == 1) {
+      if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(0))) {
+        State = setIteratorPosition(State, Call.getReturnValue(), *Pos);
+        if (cast<CXXConstructorDecl>(Func)->isMoveConstructor()) {
+          State = removeIteratorPosition(State, Call.getArgSVal(0));
+        }
+        C.addTransition(State);
+        return;
+      }
+    }
+
+    // Assumption: if return value is an iterator which is not yet bound to a
+    //             container, then look for the first iterator argument, and
+    //             bind the return value to the same container. This approach
+    //             works for STL algorithms.
+    // FIXME: Add a more conservative mode
+    for (unsigned i = 0; i < Call.getNumArgs(); ++i) {
+      if (isIteratorType(Call.getArgExpr(i)->getType())) {
+        if (const auto *Pos = getIteratorPosition(State, Call.getArgSVal(i))) {
+          assignToContainer(C, OrigExpr, Call.getReturnValue(),
+                            Pos->getContainer());
+          return;
+        }
+      }
+    }
+  }
+}
+
+void IteratorChecker::checkPostStmt(const MaterializeTemporaryExpr *MTE,
+                                    CheckerContext &C) const {
+  /* Transfer iterator state to temporary objects */
+  auto State = C.getState();
+  const auto *LCtx = C.getLocationContext();
+  const auto *Pos =
+      getIteratorPosition(State, State->getSVal(MTE->GetTemporaryExpr(), LCtx));
+  if (!Pos)
+    return;
+  State = setIteratorPosition(State, State->getSVal(MTE, LCtx), *Pos);
+  C.addTransition(State);
+}
+
+void IteratorChecker::checkDeadSymbols(SymbolReaper &SR,
+                                       CheckerContext &C) const {
+  // Cleanup
+  auto State = C.getState();
+
+  auto RegionMap = State->get<IteratorRegionMap>();
+  for (const auto Reg : RegionMap) {
+    if (!SR.isLiveRegion(Reg.first)) {
+      State = State->remove<IteratorRegionMap>(Reg.first);
+    }
+  }
+
+  auto SymbolMap = State->get<IteratorSymbolMap>();
+  for (const auto Sym : SymbolMap) {
+    if (!SR.isLive(Sym.first)) {
+      State = State->remove<IteratorSymbolMap>(Sym.first);
+    }
+  }
+
+  auto ContMap = State->get<ContainerMap>();
+  for (const auto Cont : ContMap) {
+    if (!SR.isLiveRegion(Cont.first)) {
+      State = State->remove<ContainerMap>(Cont.first);
+    }
+  }
+
+  auto ComparisonMap = State->get<IteratorComparisonMap>();
+  for (const auto Comp : ComparisonMap) {
+    if (!SR.isLive(Comp.first)) {
+      State = State->remove<IteratorComparisonMap>(Comp.first);
+    }
+  }
+}
+
+ProgramStateRef IteratorChecker::evalAssume(ProgramStateRef State, SVal Cond,
+                                            bool Assumption) const {
+  // Load recorded comparison and transfer iterator state between sides
+  // according to comparison operator and assumption
+  const auto *SE = Cond.getAsSymExpr();
+  if (!SE)
+    return State;
+
+  auto Opc = getOpcode(SE);
+  if (Opc != BO_EQ && Opc != BO_NE)
+    return State;
+
+  bool Negated = false;
+  const auto *Comp = loadComparison(State, SE);
+  if (!Comp) {
+    // Try negated comparison, which is a SymExpr to 0 integer comparison
+    const auto *SIE = dyn_cast<SymIntExpr>(SE);
+    if (!SIE)
+      return State;
+
+    if (SIE->getRHS() != 0)
+      return State;
+
+    SE = SIE->getLHS();
+    Negated = SIE->getOpcode() == BO_EQ; // Equal to zero means negation
+    Opc = getOpcode(SE);
+    if (Opc != BO_EQ && Opc != BO_NE)
+      return State;
+
+    Comp = loadComparison(State, SE);
+    if (!Comp)
+      return State;
+  }
+
+  return processComparison(State, Comp->getLeft(), Comp->getRight(),
+                           (Comp->isEquality() == Assumption) != Negated);
+}
+
+void IteratorChecker::handleComparison(CheckerContext &C, const SVal &RetVal,
+                                       const SVal &LVal, const SVal &RVal,
+                                       OverloadedOperatorKind Op) const {
+  // Record the operands and the operator of the comparison for the next
+  // evalAssume, if the result is a symbolic expression. If it is a concrete
+  // value (only one branch is possible), then transfer the state between
+  // the operands according to the operator and the result
+  auto State = C.getState();
+  if (const auto *Condition = RetVal.getAsSymbolicExpression()) {
+    const auto *LPos = getIteratorPosition(State, LVal);
+    const auto *RPos = getIteratorPosition(State, RVal);
+    if (!LPos && !RPos)
+      return;
+    State = saveComparison(State, Condition, LVal, RVal, Op == OO_EqualEqual);
+    C.addTransition(State);
+  } else if (const auto TruthVal = RetVal.getAs<nonloc::ConcreteInt>()) {
+    if ((State = processComparison(
+             State, getRegionOrSymbol(LVal), getRegionOrSymbol(RVal),
+             (Op == OO_EqualEqual) == (TruthVal->getValue() != 0)))) {
+      C.addTransition(State);
+    } else {
+      C.generateSink(State, C.getPredecessor());
+    }
+  }
+}
+
+void IteratorChecker::verifyDereference(CheckerContext &C,
+                                        const SVal &Val) const {
+  auto State = C.getState();
+  const auto *Pos = getIteratorPosition(State, Val);
+  if (Pos && isOutOfRange(State, *Pos)) {
+    // If I do not put a tag here, some range tests will fail
+    static CheckerProgramPointTag Tag("IteratorRangeChecker",
+                                      "IteratorOutOfRange");
+    auto *N = C.generateNonFatalErrorNode(State, &Tag);
+    if (!N) {
+      return;
+    }
+    reportOutOfRangeBug("Iterator accessed outside of its range.", Val, C, N);
+  }
+}
+
+void IteratorChecker::handleEnd(CheckerContext &C, const Expr *CE,
+                                const SVal &RetVal, const SVal &Cont) const {
+  const auto *ContReg = Cont.getAsRegion();
+  if (!ContReg)
+    return;
+
+  while (const auto *CBOR = ContReg->getAs<CXXBaseObjectRegion>()) {
+    ContReg = CBOR->getSuperRegion();
+  }
+
+  // If the container already has an end symbol then use it. Otherwise first
+  // create a new one.
+  auto State = C.getState();
+  auto EndSym = getContainerEnd(State, ContReg);
+  if (!EndSym) {
+    auto &SymMgr = C.getSymbolManager();
+    EndSym = SymMgr.conjureSymbol(CE, C.getLocationContext(),
+                                  C.getASTContext().LongTy, C.blockCount());
+    State = createContainerEnd(State, ContReg, EndSym);
+  }
+  State = setIteratorPosition(State, RetVal,
+                              IteratorPosition::getPosition(ContReg, EndSym));
+  C.addTransition(State);
+}
+
+void IteratorChecker::assignToContainer(CheckerContext &C, const Expr *CE,
+                                        const SVal &RetVal,
+                                        const MemRegion *Cont) const {
+  while (const auto *CBOR = Cont->getAs<CXXBaseObjectRegion>()) {
+    Cont = CBOR->getSuperRegion();
+  }
+
+  auto State = C.getState();
+  auto &SymMgr = C.getSymbolManager();
+  auto Sym = SymMgr.conjureSymbol(CE, C.getLocationContext(),
+                                  C.getASTContext().LongTy, C.blockCount());
+  State = setIteratorPosition(State, RetVal,
+                              IteratorPosition::getPosition(Cont, Sym));
+  C.addTransition(State);
+}
+
+void IteratorChecker::reportOutOfRangeBug(const StringRef &Message,
+                                          const SVal &Val, CheckerContext &C,
+                                          ExplodedNode *ErrNode) const {
+  auto R = llvm::make_unique<BugReport>(*OutOfRangeBugType, Message, ErrNode);
+  R->markInteresting(Val);
+  C.emitReport(std::move(R));
+}
+
+namespace {
+
+bool isGreaterOrEqual(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2);
+bool compare(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2,
+             BinaryOperator::Opcode Opc);
+
+bool isIteratorType(const QualType &Type) {
+  if (Type->isPointerType())
+    return true;
+
+  const auto *CRD = Type->getUnqualifiedDesugaredType()->getAsCXXRecordDecl();
+  return isIterator(CRD);
+}
+
+bool isIterator(const CXXRecordDecl *CRD) {
+  if (!CRD)
+    return false;
+
+  const auto Name = CRD->getName();
+  if (!(Name.endswith_lower("iterator") || Name.endswith_lower("iter") ||
+        Name.endswith_lower("it")))
+    return false;
+
+  bool HasCopyCtor = false, HasCopyAssign = true, HasDtor = false,
+       HasPreIncrOp = false, HasPostIncrOp = false, HasDerefOp = false;
+  for (const auto *Method : CRD->methods()) {
+    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Method)) {
+      if (Ctor->isCopyConstructor()) {
+        HasCopyCtor = !Ctor->isDeleted() && Ctor->getAccess() == AS_public;
+      }
+      continue;
+    }
+    if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
+      HasDtor = !Dtor->isDeleted() && Dtor->getAccess() == AS_public;
+      continue;
+    }
+    if (Method->isCopyAssignmentOperator()) {
+      HasCopyAssign = !Method->isDeleted() && Method->getAccess() == AS_public;
+      continue;
+    }
+    if (!Method->isOverloadedOperator())
+      continue;
+    const auto OPK = Method->getOverloadedOperator();
+    if (OPK == OO_PlusPlus) {
+      HasPreIncrOp = HasPreIncrOp || (Method->getNumParams() == 0);
+      HasPostIncrOp = HasPostIncrOp || (Method->getNumParams() == 1);
+      continue;
+    }
+    if (OPK == OO_Star) {
+      HasDerefOp = (Method->getNumParams() == 0);
+      continue;
+    }
+  }
+
+  return HasCopyCtor && HasCopyAssign && HasDtor && HasPreIncrOp &&
+         HasPostIncrOp && HasDerefOp;
+}
+
+bool isEndCall(const FunctionDecl *Func) {
+  const auto *IdInfo = Func->getIdentifier();
+  if (!IdInfo)
+    return false;
+  return IdInfo->getName().endswith_lower("end");
+}
+
+bool isSimpleComparisonOperator(OverloadedOperatorKind OK) {
+  return OK == OO_EqualEqual || OK == OO_ExclaimEqual;
+}
+
+bool isDereferenceOperator(OverloadedOperatorKind OK) {
+  return OK == OO_Star || OK == OO_Arrow || OK == OO_ArrowStar ||
+         OK == OO_Subscript;
+}
+
+BinaryOperator::Opcode getOpcode(const SymExpr *SE) {
+  if (const auto *BSE = dyn_cast<BinarySymExpr>(SE)) {
+    return BSE->getOpcode();
+  } else if (const auto *SC = dyn_cast<SymbolConjured>(SE)) {
+    const auto *COE = dyn_cast<CXXOperatorCallExpr>(SC->getStmt());
+    if (!COE)
+      return BO_Comma; // Extremal value, neither EQ nor NE
+    if (COE->getOperator() == OO_EqualEqual) {
+      return BO_EQ;
+    } else if (COE->getOperator() == OO_ExclaimEqual) {
+      return BO_NE;
+    }
+    return BO_Comma; // Extremal value, neither EQ nor NE
+  }
+  return BO_Comma; // Extremal value, neither EQ nor NE
+}
+
+const RegionOrSymbol getRegionOrSymbol(const SVal &Val) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return Reg;
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return Sym;
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return LCVal->getRegion();
+  }
+  return RegionOrSymbol();
+}
+
+const ProgramStateRef processComparison(ProgramStateRef State,
+                                        RegionOrSymbol LVal,
+                                        RegionOrSymbol RVal, bool Equal) {
+  const auto *LPos = getIteratorPosition(State, LVal);
+  const auto *RPos = getIteratorPosition(State, RVal);
+  if (LPos && !RPos) {
+    State = adjustIteratorPosition(State, RVal, *LPos, Equal);
+  } else if (!LPos && RPos) {
+    State = adjustIteratorPosition(State, LVal, *RPos, Equal);
+  } else if (LPos && RPos) {
+    State = relateIteratorPositions(State, *LPos, *RPos, Equal);
+  }
+  return State;
+}
+
+const ProgramStateRef saveComparison(ProgramStateRef State,
+                                     const SymExpr *Condition, const SVal &LVal,
+                                     const SVal &RVal, bool Eq) {
+  const auto Left = getRegionOrSymbol(LVal);
+  const auto Right = getRegionOrSymbol(RVal);
+  if (!Left || !Right)
+    return State;
+  return State->set<IteratorComparisonMap>(Condition,
+                                           IteratorComparison(Left, Right, Eq));
+}
+
+const IteratorComparison *loadComparison(ProgramStateRef State,
+                                         const SymExpr *Condition) {
+  return State->get<IteratorComparisonMap>(Condition);
+}
+
+SymbolRef getContainerEnd(ProgramStateRef State, const MemRegion *Cont) {
+  const auto *CDataPtr = getContainerData(State, Cont);
+  if (!CDataPtr)
+    return nullptr;
+
+  return CDataPtr->getEnd();
+}
+
+ProgramStateRef createContainerEnd(ProgramStateRef State, const MemRegion *Cont,
+                                   const SymbolRef Sym) {
+  // Only create if it does not exist
+  const auto *CDataPtr = getContainerData(State, Cont);
+  if (CDataPtr) {
+    if (CDataPtr->getEnd()) {
+      return State;
+    } else {
+      const auto CData = CDataPtr->newEnd(Sym);
+      return setContainerData(State, Cont, CData);
+    }
+  } else {
+    const auto CData = ContainerData::fromEnd(Sym);
+    return setContainerData(State, Cont, CData);
+  }
+}
+
+const ContainerData *getContainerData(ProgramStateRef State,
+                                      const MemRegion *Cont) {
+  return State->get<ContainerMap>(Cont);
+}
+
+ProgramStateRef setContainerData(ProgramStateRef State, const MemRegion *Cont,
+                                 const ContainerData &CData) {
+  return State->set<ContainerMap>(Cont, CData);
+}
+
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            const SVal &Val) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return State->get<IteratorRegionMap>(Reg);
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return State->get<IteratorSymbolMap>(Sym);
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return State->get<IteratorRegionMap>(LCVal->getRegion());
+  }
+  return nullptr;
+}
+
+const IteratorPosition *getIteratorPosition(ProgramStateRef State,
+                                            RegionOrSymbol RegOrSym) {
+  if (RegOrSym.is<const MemRegion *>()) {
+    return State->get<IteratorRegionMap>(RegOrSym.get<const MemRegion *>());
+  } else if (RegOrSym.is<SymbolRef>()) {
+    return State->get<IteratorSymbolMap>(RegOrSym.get<SymbolRef>());
+  }
+  return nullptr;
+}
+
+ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
+                                    const IteratorPosition &Pos) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return State->set<IteratorRegionMap>(Reg, Pos);
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return State->set<IteratorSymbolMap>(Sym, Pos);
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return State->set<IteratorRegionMap>(LCVal->getRegion(), Pos);
+  }
+  return nullptr;
+}
+
+ProgramStateRef setIteratorPosition(ProgramStateRef State,
+                                    RegionOrSymbol RegOrSym,
+                                    const IteratorPosition &Pos) {
+  if (RegOrSym.is<const MemRegion *>()) {
+    return State->set<IteratorRegionMap>(RegOrSym.get<const MemRegion *>(),
+                                         Pos);
+  } else if (RegOrSym.is<SymbolRef>()) {
+    return State->set<IteratorSymbolMap>(RegOrSym.get<SymbolRef>(), Pos);
+  }
+  return nullptr;
+}
+
+ProgramStateRef removeIteratorPosition(ProgramStateRef State, const SVal &Val) {
+  if (const auto Reg = Val.getAsRegion()) {
+    return State->remove<IteratorRegionMap>(Reg);
+  } else if (const auto Sym = Val.getAsSymbol()) {
+    return State->remove<IteratorSymbolMap>(Sym);
+  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
+    return State->remove<IteratorRegionMap>(LCVal->getRegion());
+  }
+  return nullptr;
+}
+
+ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
+                                       RegionOrSymbol RegOrSym,
+                                       const IteratorPosition &Pos,
+                                       bool Equal) {
+  if (Equal) {
+    return setIteratorPosition(State, RegOrSym, Pos);
+  } else {
+    return State;
+  }
+}
+
+ProgramStateRef relateIteratorPositions(ProgramStateRef State,
+                                        const IteratorPosition &Pos1,
+                                        const IteratorPosition &Pos2,
+                                        bool Equal) {
+  // Try to compare them and get a defined value
+  auto &SVB = State->getStateManager().getSValBuilder();
+  const auto comparison =
+      SVB.evalBinOp(State, BO_EQ, nonloc::SymbolVal(Pos1.getOffset()),
+                    nonloc::SymbolVal(Pos2.getOffset()), SVB.getConditionType())
+          .getAs<DefinedSVal>();
+  if (comparison) {
+    return State->assume(*comparison, Equal);
+  }
+
+  return State;
+}
+
+bool isOutOfRange(ProgramStateRef State, const IteratorPosition &Pos) {
+  const auto *Cont = Pos.getContainer();
+  const auto *CData = getContainerData(State, Cont);
+  if (!CData)
+    return false;
+
+  // Out of range means less than the begin symbol or greater or equal to the
+  // end symbol.
+
+  const auto End = CData->getEnd();
+  if (End) {
+    if (isGreaterOrEqual(State, Pos.getOffset(), End)) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool isGreaterOrEqual(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2) {
+  return compare(State, Sym1, Sym2, BO_GE);
+}
+
+bool compare(ProgramStateRef State, SymbolRef Sym1, SymbolRef Sym2,
+             BinaryOperator::Opcode Opc) {
+  auto &SMgr = State->getStateManager();
+  auto &SVB = SMgr.getSValBuilder();
+
+  const auto comparison =
+      SVB.evalBinOp(State, Opc, nonloc::SymbolVal(Sym1),
+                    nonloc::SymbolVal(Sym2), SVB.getConditionType())
+          .getAs<DefinedSVal>();
+
+  if(comparison) {
+    return !!State->assume(*comparison, true);
+  }
+
+  return false;
+}
+
+} // namespace
+
+#define REGISTER_CHECKER(name)                                                 \
+  void ento::register##name(CheckerManager &Mgr) {                             \
+    auto *checker = Mgr.registerChecker<IteratorChecker>();                    \
+    checker->ChecksEnabled[IteratorChecker::CK_##name] = true;                 \
+    checker->CheckNames[IteratorChecker::CK_##name] =                          \
+        Mgr.getCurrentCheckName();                                             \
+  }
+
+REGISTER_CHECKER(IteratorRangeChecker)
diff --git a/lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp b/lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp
deleted file mode 100644
index f0f7c98c9640..000000000000
--- a/lib/StaticAnalyzer/Checkers/IteratorPastEndChecker.cpp
+++ /dev/null
@@ -1,840 +0,0 @@
-//===-- IteratorPastEndChecker.cpp --------------------------------*- C++ -*--//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines a checker for using iterators outside their range (past end). Usage
-// means here dereferencing, incrementing etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ClangSACheckers.h"
-#include "clang/StaticAnalyzer/Core/BugReporter/BugType.h"
-#include "clang/StaticAnalyzer/Core/Checker.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/CallEvent.h"
-#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
-
-#include <utility>
-
-using namespace clang;
-using namespace ento;
-
-namespace {
-struct IteratorPosition {
-private:
-  enum Kind { InRange, OutofRange } K;
-  IteratorPosition(Kind InK) : K(InK) {}
-
-public:
-  bool isInRange() const { return K == InRange; }
-  bool isOutofRange() const { return K == OutofRange; }
-
-  static IteratorPosition getInRange() { return IteratorPosition(InRange); }
-  static IteratorPosition getOutofRange() {
-    return IteratorPosition(OutofRange);
-  }
-
-  bool operator==(const IteratorPosition &X) const { return K == X.K; }
-  bool operator!=(const IteratorPosition &X) const { return K != X.K; }
-  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(K); }
-};
-
-typedef llvm::PointerUnion<const MemRegion *, SymbolRef> RegionOrSymbol;
-
-struct IteratorComparison {
-private:
-  RegionOrSymbol Left, Right;
-  bool Equality;
-
-public:
-  IteratorComparison(RegionOrSymbol L, RegionOrSymbol R, bool Eq)
-      : Left(L), Right(R), Equality(Eq) {}
-
-  RegionOrSymbol getLeft() const { return Left; }
-  RegionOrSymbol getRight() const { return Right; }
-  bool isEquality() const { return Equality; }
-  bool operator==(const IteratorComparison &X) const {
-    return Left == X.Left && Right == X.Right && Equality == X.Equality;
-  }
-  bool operator!=(const IteratorComparison &X) const {
-    return Left != X.Left || Right != X.Right || Equality != X.Equality;
-  }
-  void Profile(llvm::FoldingSetNodeID &ID) const { ID.AddInteger(Equality); }
-};
-
-class IteratorPastEndChecker
-    : public Checker<
-          check::PreCall, check::PostCall, check::PreStmt<CXXOperatorCallExpr>,
-          check::PostStmt<CXXConstructExpr>, check::PostStmt<DeclStmt>,
-          check::PostStmt<MaterializeTemporaryExpr>, check::BeginFunction,
-          check::DeadSymbols, eval::Assume, eval::Call> {
-  mutable IdentifierInfo *II_find = nullptr,
-                         *II_find_end = nullptr, *II_find_first_of = nullptr,
-                         *II_find_if = nullptr, *II_find_if_not = nullptr,
-                         *II_lower_bound = nullptr, *II_upper_bound = nullptr,
-                         *II_search = nullptr, *II_search_n = nullptr;
-
-  std::unique_ptr<BugType> PastEndBugType;
-
-  void handleComparison(CheckerContext &C, const SVal &RetVal, const SVal &LVal,
-                        const SVal &RVal, OverloadedOperatorKind Op) const;
-  void handleAccess(CheckerContext &C, const SVal &Val) const;
-  void handleDecrement(CheckerContext &C, const SVal &Val) const;
-  void handleEnd(CheckerContext &C, const SVal &RetVal) const;
-
-  bool evalFind(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindEnd(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindFirstOf(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindIf(CheckerContext &C, const CallExpr *CE) const;
-  bool evalFindIfNot(CheckerContext &C, const CallExpr *CE) const;
-  bool evalLowerBound(CheckerContext &C, const CallExpr *CE) const;
-  bool evalUpperBound(CheckerContext &C, const CallExpr *CE) const;
-  bool evalSearch(CheckerContext &C, const CallExpr *CE) const;
-  bool evalSearchN(CheckerContext &C, const CallExpr *CE) const;
-  void Find(CheckerContext &C, const CallExpr *CE) const;
-
-  void reportPastEndBug(const StringRef &Message, const SVal &Val,
-                        CheckerContext &C, ExplodedNode *ErrNode) const;
-  void initIdentifiers(ASTContext &Ctx) const;
-
-public:
-  IteratorPastEndChecker();
-
-  void checkPreCall(const CallEvent &Call, CheckerContext &C) const;
-  void checkPostCall(const CallEvent &Call, CheckerContext &C) const;
-  void checkPreStmt(const CXXOperatorCallExpr *COCE, CheckerContext &C) const;
-  void checkBeginFunction(CheckerContext &C) const;
-  void checkPostStmt(const CXXConstructExpr *CCE, CheckerContext &C) const;
-  void checkPostStmt(const DeclStmt *DS, CheckerContext &C) const;
-  void checkPostStmt(const MaterializeTemporaryExpr *MTE,
-                     CheckerContext &C) const;
-  void checkDeadSymbols(SymbolReaper &SR, CheckerContext &C) const;
-  ProgramStateRef evalAssume(ProgramStateRef State, SVal Cond,
-                             bool Assumption) const;
-  bool evalCall(const CallExpr *CE, CheckerContext &C) const;
-};
-}
-
-REGISTER_MAP_WITH_PROGRAMSTATE(IteratorSymbolMap, SymbolRef, IteratorPosition)
-REGISTER_MAP_WITH_PROGRAMSTATE(IteratorRegionMap, const MemRegion *,
-                               IteratorPosition)
-
-REGISTER_MAP_WITH_PROGRAMSTATE(IteratorComparisonMap, const SymExpr *,
-                               IteratorComparison)
-
-#define INIT_ID(Id)                                                            \
-  if (!II_##Id)                                                                \
-  II_##Id = &Ctx.Idents.get(#Id)
-
-namespace {
-
-bool isIteratorType(const QualType &Type);
-bool isIterator(const CXXRecordDecl *CRD);
-bool isEndCall(const FunctionDecl *Func);
-bool isSimpleComparisonOperator(OverloadedOperatorKind OK);
-bool isAccessOperator(OverloadedOperatorKind OK);
-bool isDecrementOperator(OverloadedOperatorKind OK);
-BinaryOperator::Opcode getOpcode(const SymExpr *SE);
-const RegionOrSymbol getRegionOrSymbol(const SVal &Val);
-const ProgramStateRef processComparison(ProgramStateRef State,
-                                        RegionOrSymbol LVal,
-                                        RegionOrSymbol RVal, bool Equal);
-const ProgramStateRef saveComparison(ProgramStateRef State,
-                                     const SymExpr *Condition, const SVal &LVal,
-                                     const SVal &RVal, bool Eq);
-const IteratorComparison *loadComparison(ProgramStateRef State,
-                                         const SymExpr *Condition);
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            const SVal &Val);
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            RegionOrSymbol RegOrSym);
-ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
-                                    IteratorPosition Pos);
-ProgramStateRef setIteratorPosition(ProgramStateRef State,
-                                    RegionOrSymbol RegOrSym,
-                                    IteratorPosition Pos);
-ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
-                                       RegionOrSymbol RegOrSym,
-                                       IteratorPosition Pos, bool Equal);
-bool contradictingIteratorPositions(IteratorPosition Pos1,
-                                    IteratorPosition Pos2, bool Equal);
-}
-
-IteratorPastEndChecker::IteratorPastEndChecker() {
-  PastEndBugType.reset(
-      new BugType(this, "Iterator Past End", "Misuse of STL APIs"));
-  PastEndBugType->setSuppressOnSink(true);
-}
-
-void IteratorPastEndChecker::checkPreCall(const CallEvent &Call,
-                                          CheckerContext &C) const {
-  // Check for access past end
-  const auto *Func = Call.getDecl()->getAsFunction();
-  if (!Func)
-    return;
-  if (Func->isOverloadedOperator()) {
-    if (isAccessOperator(Func->getOverloadedOperator())) {
-      if (const auto *InstCall = dyn_cast<CXXInstanceCall>(&Call)) {
-        handleAccess(C, InstCall->getCXXThisVal());
-      } else {
-        handleAccess(C, Call.getArgSVal(0));
-      }
-    }
-  }
-}
-
-void IteratorPastEndChecker::checkPostCall(const CallEvent &Call,
-                                           CheckerContext &C) const {
-  // Record end() iterators, iterator decrementation and comparison
-  const auto *Func = Call.getDecl()->getAsFunction();
-  if (!Func)
-    return;
-  if (Func->isOverloadedOperator()) {
-    const auto Op = Func->getOverloadedOperator();
-    if (isSimpleComparisonOperator(Op)) {
-      if (Func->isCXXInstanceMember()) {
-        const auto &InstCall = static_cast<const CXXInstanceCall &>(Call);
-        handleComparison(C, InstCall.getReturnValue(), InstCall.getCXXThisVal(),
-                         InstCall.getArgSVal(0), Op);
-      } else {
-        handleComparison(C, Call.getReturnValue(), Call.getArgSVal(0),
-                         Call.getArgSVal(1), Op);
-      }
-    } else if (isDecrementOperator(Func->getOverloadedOperator())) {
-      if (Func->isCXXInstanceMember()) {
-        const auto &InstCall = static_cast<const CXXInstanceCall &>(Call);
-        handleDecrement(C, InstCall.getCXXThisVal());
-      } else {
-        handleDecrement(C, Call.getArgSVal(0));
-      }
-    }
-  } else if (Func->isCXXInstanceMember()) {
-    if (!isEndCall(Func))
-      return;
-    if (!isIteratorType(Call.getResultType()))
-      return;
-    handleEnd(C, Call.getReturnValue());
-  }
-}
-
-void IteratorPastEndChecker::checkPreStmt(const CXXOperatorCallExpr *COCE,
-                                          CheckerContext &C) const {
-  const auto *ThisExpr = COCE->getArg(0);
-
-  auto State = C.getState();
-  const auto *LCtx = C.getPredecessor()->getLocationContext();
-
-  const auto CurrentThis = State->getSVal(ThisExpr, LCtx);
-  if (const auto *Reg = CurrentThis.getAsRegion()) {
-    if (!Reg->getAs<CXXTempObjectRegion>())
-      return;
-    const auto OldState = C.getPredecessor()->getFirstPred()->getState();
-    const auto OldThis = OldState->getSVal(ThisExpr, LCtx);
-    const auto *Pos = getIteratorPosition(OldState, OldThis);
-    if (!Pos)
-      return;
-    State = setIteratorPosition(State, CurrentThis, *Pos);
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::checkBeginFunction(CheckerContext &C) const {
-  // Copy state of iterator arguments to iterator parameters
-  auto State = C.getState();
-  const auto *LCtx = C.getLocationContext();
-
-  const auto *Site = cast<StackFrameContext>(LCtx)->getCallSite();
-  if (!Site)
-    return;
-
-  const auto *FD = dyn_cast<FunctionDecl>(LCtx->getDecl());
-  if (!FD)
-    return;
-
-  const auto *CE = dyn_cast<CallExpr>(Site);
-  if (!CE)
-    return;
-
-  bool Change = false;
-  int idx = 0;
-  for (const auto P : FD->parameters()) {
-    auto Param = State->getLValue(P, LCtx);
-    auto Arg = State->getSVal(CE->getArg(idx++), LCtx->getParent());
-    const auto *Pos = getIteratorPosition(State, Arg);
-    if (!Pos)
-      continue;
-    State = setIteratorPosition(State, Param, *Pos);
-    Change = true;
-  }
-  if (Change) {
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::checkPostStmt(const CXXConstructExpr *CCE,
-                                           CheckerContext &C) const {
-  // Transfer iterator state in case of copy or move by constructor
-  const auto *ctr = CCE->getConstructor();
-  if (!ctr->isCopyOrMoveConstructor())
-    return;
-  const auto *RHSExpr = CCE->getArg(0);
-
-  auto State = C.getState();
-  const auto *LCtx = C.getLocationContext();
-
-  const auto RetVal = State->getSVal(CCE, LCtx);
-
-  const auto RHSVal = State->getSVal(RHSExpr, LCtx);
-  const auto *RHSPos = getIteratorPosition(State, RHSVal);
-  if (!RHSPos)
-    return;
-  State = setIteratorPosition(State, RetVal, *RHSPos);
-  C.addTransition(State);
-}
-
-void IteratorPastEndChecker::checkPostStmt(const DeclStmt *DS,
-                                           CheckerContext &C) const {
-  // Transfer iterator state to new variable declaration
-  for (const auto *D : DS->decls()) {
-    const auto *VD = dyn_cast<VarDecl>(D);
-    if (!VD || !VD->hasInit())
-      continue;
-
-    auto State = C.getState();
-    const auto *LCtx = C.getPredecessor()->getLocationContext();
-    const auto *Pos =
-        getIteratorPosition(State, State->getSVal(VD->getInit(), LCtx));
-    if (!Pos)
-      continue;
-    State = setIteratorPosition(State, State->getLValue(VD, LCtx), *Pos);
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::checkPostStmt(const MaterializeTemporaryExpr *MTE,
-                                           CheckerContext &C) const {
-  /* Transfer iterator state for to temporary objects */
-  auto State = C.getState();
-  const auto *LCtx = C.getPredecessor()->getLocationContext();
-  const auto *Pos =
-      getIteratorPosition(State, State->getSVal(MTE->GetTemporaryExpr(), LCtx));
-  if (!Pos)
-    return;
-  State = setIteratorPosition(State, State->getSVal(MTE, LCtx), *Pos);
-  C.addTransition(State);
-}
-
-void IteratorPastEndChecker::checkDeadSymbols(SymbolReaper &SR,
-                                              CheckerContext &C) const {
-  auto State = C.getState();
-
-  auto RegionMap = State->get<IteratorRegionMap>();
-  for (const auto Reg : RegionMap) {
-    if (!SR.isLiveRegion(Reg.first)) {
-      State = State->remove<IteratorRegionMap>(Reg.first);
-    }
-  }
-
-  auto SymbolMap = State->get<IteratorSymbolMap>();
-  for (const auto Sym : SymbolMap) {
-    if (SR.isDead(Sym.first)) {
-      State = State->remove<IteratorSymbolMap>(Sym.first);
-    }
-  }
-
-  auto ComparisonMap = State->get<IteratorComparisonMap>();
-  for (const auto Comp : ComparisonMap) {
-    if (SR.isDead(Comp.first)) {
-      State = State->remove<IteratorComparisonMap>(Comp.first);
-    }
-  }
-}
-
-ProgramStateRef IteratorPastEndChecker::evalAssume(ProgramStateRef State,
-                                                   SVal Cond,
-                                                   bool Assumption) const {
-  // Load recorded comparison and transfer iterator state between sides
-  // according to comparison operator and assumption
-  const auto *SE = Cond.getAsSymExpr();
-  if (!SE)
-    return State;
-
-  auto Opc = getOpcode(SE);
-  if (Opc != BO_EQ && Opc != BO_NE)
-    return State;
-
-  bool Negated = false;
-  const auto *Comp = loadComparison(State, SE);
-  if (!Comp) {
-    // Try negated comparison, which is a SymExpr to 0 integer comparison
-    const auto *SIE = dyn_cast<SymIntExpr>(SE);
-    if (!SIE)
-      return State;
-
-    if (SIE->getRHS() != 0)
-      return State;
-
-    SE = SIE->getLHS();
-    Negated = SIE->getOpcode() == BO_EQ; // Equal to zero means negation
-    Opc = getOpcode(SE);
-    if (Opc != BO_EQ && Opc != BO_NE)
-      return State;
-
-    Comp = loadComparison(State, SE);
-    if (!Comp)
-      return State;
-  }
-
-  return processComparison(State, Comp->getLeft(), Comp->getRight(),
-                           (Comp->isEquality() == Assumption) != Negated);
-}
-
-// FIXME: Evaluation of these STL calls should be moved to StdCLibraryFunctions
-//       checker (see patch r284960) or another similar checker for C++ STL
-//       functions (e.g. StdCXXLibraryFunctions or StdCppLibraryFunctions).
-bool IteratorPastEndChecker::evalCall(const CallExpr *CE,
-                                      CheckerContext &C) const {
-  const FunctionDecl *FD = C.getCalleeDecl(CE);
-  if (!FD)
-    return false;
-
-  ASTContext &Ctx = C.getASTContext();
-  initIdentifiers(Ctx);
-
-  if (FD->getKind() == Decl::Function) {
-    if (FD->isInStdNamespace()) {
-      if (FD->getIdentifier() == II_find) {
-        return evalFind(C, CE);
-      } else if (FD->getIdentifier() == II_find_end) {
-        return evalFindEnd(C, CE);
-      } else if (FD->getIdentifier() == II_find_first_of) {
-        return evalFindFirstOf(C, CE);
-      } else if (FD->getIdentifier() == II_find_if) {
-        return evalFindIf(C, CE);
-      } else if (FD->getIdentifier() == II_find_if_not) {
-        return evalFindIfNot(C, CE);
-      } else if (FD->getIdentifier() == II_upper_bound) {
-        return evalUpperBound(C, CE);
-      } else if (FD->getIdentifier() == II_lower_bound) {
-        return evalLowerBound(C, CE);
-      } else if (FD->getIdentifier() == II_search) {
-        return evalSearch(C, CE);
-      } else if (FD->getIdentifier() == II_search_n) {
-        return evalSearchN(C, CE);
-      }
-    }
-  }
-
-  return false;
-}
-
-void IteratorPastEndChecker::handleComparison(CheckerContext &C,
-                                              const SVal &RetVal,
-                                              const SVal &LVal,
-                                              const SVal &RVal,
-                                              OverloadedOperatorKind Op) const {
-  // Record the operands and the operator of the comparison for the next
-  // evalAssume, if the result is a symbolic expression. If it is a concrete
-  // value (only one branch is possible), then transfer the state between
-  // the operands according to the operator and the result
-  auto State = C.getState();
-  if (const auto *Condition = RetVal.getAsSymbolicExpression()) {
-    const auto *LPos = getIteratorPosition(State, LVal);
-    const auto *RPos = getIteratorPosition(State, RVal);
-    if (!LPos && !RPos)
-      return;
-    State = saveComparison(State, Condition, LVal, RVal, Op == OO_EqualEqual);
-    C.addTransition(State);
-  } else if (const auto TruthVal = RetVal.getAs<nonloc::ConcreteInt>()) {
-    if ((State = processComparison(
-             State, getRegionOrSymbol(LVal), getRegionOrSymbol(RVal),
-             (Op == OO_EqualEqual) == (TruthVal->getValue() != 0)))) {
-      C.addTransition(State);
-    } else {
-      C.generateSink(State, C.getPredecessor());
-    }
-  }
-}
-
-void IteratorPastEndChecker::handleAccess(CheckerContext &C,
-                                          const SVal &Val) const {
-  auto State = C.getState();
-  const auto *Pos = getIteratorPosition(State, Val);
-  if (Pos && Pos->isOutofRange()) {
-    auto *N = C.generateNonFatalErrorNode(State);
-    if (!N) {
-      return;
-    }
-    reportPastEndBug("Iterator accessed past its end.", Val, C, N);
-  }
-}
-
-void IteratorPastEndChecker::handleDecrement(CheckerContext &C,
-                                             const SVal &Val) const {
-  auto State = C.getState();
-  const auto *Pos = getIteratorPosition(State, Val);
-  if (Pos && Pos->isOutofRange()) {
-    State = setIteratorPosition(State, Val, IteratorPosition::getInRange());
-    // FIXME: We could also check for iterators ahead of their beginnig in the
-    //       future, but currently we do not care for such errors. We also
-    //       assume that the iterator is not past its end by more then one
-    //       position.
-    C.addTransition(State);
-  }
-}
-
-void IteratorPastEndChecker::handleEnd(CheckerContext &C,
-                                       const SVal &RetVal) const {
-  auto State = C.getState();
-  State = setIteratorPosition(State, RetVal, IteratorPosition::getOutofRange());
-  C.addTransition(State);
-}
-
-bool IteratorPastEndChecker::evalFind(CheckerContext &C,
-                                      const CallExpr *CE) const {
-  if (CE->getNumArgs() == 3 && isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindEnd(CheckerContext &C,
-                                         const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType()) &&
-      isIteratorType(CE->getArg(2)->getType()) &&
-      isIteratorType(CE->getArg(3)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindFirstOf(CheckerContext &C,
-                                             const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType()) &&
-      isIteratorType(CE->getArg(2)->getType()) &&
-      isIteratorType(CE->getArg(3)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindIf(CheckerContext &C,
-                                        const CallExpr *CE) const {
-  if (CE->getNumArgs() == 3 && isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalFindIfNot(CheckerContext &C,
-                                           const CallExpr *CE) const {
-  if (CE->getNumArgs() == 3 && isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalLowerBound(CheckerContext &C,
-                                            const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 3 || CE->getNumArgs() == 4) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalUpperBound(CheckerContext &C,
-                                            const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 3 || CE->getNumArgs() == 4) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalSearch(CheckerContext &C,
-                                        const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType()) &&
-      isIteratorType(CE->getArg(2)->getType()) &&
-      isIteratorType(CE->getArg(3)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-bool IteratorPastEndChecker::evalSearchN(CheckerContext &C,
-                                         const CallExpr *CE) const {
-  if ((CE->getNumArgs() == 4 || CE->getNumArgs() == 5) &&
-      isIteratorType(CE->getArg(0)->getType()) &&
-      isIteratorType(CE->getArg(1)->getType())) {
-    Find(C, CE);
-    return true;
-  }
-  return false;
-}
-
-void IteratorPastEndChecker::Find(CheckerContext &C, const CallExpr *CE) const {
-  auto state = C.getState();
-  auto &svalBuilder = C.getSValBuilder();
-  const auto *LCtx = C.getLocationContext();
-
-  auto RetVal = svalBuilder.conjureSymbolVal(nullptr, CE, LCtx, C.blockCount());
-  auto SecondParam = state->getSVal(CE->getArg(1), LCtx);
-
-  auto stateFound = state->BindExpr(CE, LCtx, RetVal);
-  auto stateNotFound = state->BindExpr(CE, LCtx, SecondParam);
-
-  C.addTransition(stateFound);
-  C.addTransition(stateNotFound);
-}
-
-void IteratorPastEndChecker::reportPastEndBug(const StringRef &Message,
-                                              const SVal &Val,
-                                              CheckerContext &C,
-                                              ExplodedNode *ErrNode) const {
-  auto R = llvm::make_unique<BugReport>(*PastEndBugType, Message, ErrNode);
-  R->markInteresting(Val);
-  C.emitReport(std::move(R));
-}
-
-void IteratorPastEndChecker::initIdentifiers(ASTContext &Ctx) const {
-  INIT_ID(find);
-  INIT_ID(find_end);
-  INIT_ID(find_first_of);
-  INIT_ID(find_if);
-  INIT_ID(find_if_not);
-  INIT_ID(lower_bound);
-  INIT_ID(upper_bound);
-  INIT_ID(search);
-  INIT_ID(search_n);
-}
-
-namespace {
-
-bool isIteratorType(const QualType &Type) {
-  if (Type->isPointerType())
-    return true;
-
-  const auto *CRD = Type->getUnqualifiedDesugaredType()->getAsCXXRecordDecl();
-  return isIterator(CRD);
-}
-
-bool isIterator(const CXXRecordDecl *CRD) {
-  if (!CRD)
-    return false;
-
-  const auto Name = CRD->getName();
-  if (!(Name.endswith_lower("iterator") || Name.endswith_lower("iter") ||
-        Name.endswith_lower("it")))
-    return false;
-
-  bool HasCopyCtor = false, HasCopyAssign = true, HasDtor = false,
-       HasPreIncrOp = false, HasPostIncrOp = false, HasDerefOp = false;
-  for (const auto *Method : CRD->methods()) {
-    if (const auto *Ctor = dyn_cast<CXXConstructorDecl>(Method)) {
-      if (Ctor->isCopyConstructor()) {
-        HasCopyCtor = !Ctor->isDeleted() && Ctor->getAccess() == AS_public;
-      }
-      continue;
-    }
-    if (const auto *Dtor = dyn_cast<CXXDestructorDecl>(Method)) {
-      HasDtor = !Dtor->isDeleted() && Dtor->getAccess() == AS_public;
-      continue;
-    }
-    if (Method->isCopyAssignmentOperator()) {
-      HasCopyAssign = !Method->isDeleted() && Method->getAccess() == AS_public;
-      continue;
-    }
-    if (!Method->isOverloadedOperator())
-      continue;
-    const auto OPK = Method->getOverloadedOperator();
-    if (OPK == OO_PlusPlus) {
-      HasPreIncrOp = HasPreIncrOp || (Method->getNumParams() == 0);
-      HasPostIncrOp = HasPostIncrOp || (Method->getNumParams() == 1);
-      continue;
-    }
-    if (OPK == OO_Star) {
-      HasDerefOp = (Method->getNumParams() == 0);
-      continue;
-    }
-  }
-
-  return HasCopyCtor && HasCopyAssign && HasDtor && HasPreIncrOp &&
-         HasPostIncrOp && HasDerefOp;
-}
-
-bool isEndCall(const FunctionDecl *Func) {
-  const auto *IdInfo = Func->getIdentifier();
-  if (!IdInfo)
-    return false;
-  return IdInfo->getName().endswith_lower("end");
-}
-
-bool isSimpleComparisonOperator(OverloadedOperatorKind OK) {
-  return OK == OO_EqualEqual || OK == OO_ExclaimEqual;
-}
-
-bool isAccessOperator(OverloadedOperatorKind OK) {
-  return OK == OO_Star || OK == OO_Arrow || OK == OO_ArrowStar ||
-         OK == OO_Plus || OK == OO_PlusEqual || OK == OO_PlusPlus ||
-         OK == OO_Subscript;
-}
-
-bool isDecrementOperator(OverloadedOperatorKind OK) {
-  return OK == OO_MinusEqual || OK == OO_MinusMinus;
-}
-
-BinaryOperator::Opcode getOpcode(const SymExpr *SE) {
-  if (const auto *BSE = dyn_cast<BinarySymExpr>(SE)) {
-    return BSE->getOpcode();
-  } else if (const auto *SC = dyn_cast<SymbolConjured>(SE)) {
-    const auto *COE = dyn_cast<CXXOperatorCallExpr>(SC->getStmt());
-    if (!COE)
-      return BO_Comma; // Extremal value, neither EQ nor NE
-    if (COE->getOperator() == OO_EqualEqual) {
-      return BO_EQ;
-    } else if (COE->getOperator() == OO_ExclaimEqual) {
-      return BO_NE;
-    }
-    return BO_Comma; // Extremal value, neither EQ nor NE
-  }
-  return BO_Comma; // Extremal value, neither EQ nor NE
-}
-
-const RegionOrSymbol getRegionOrSymbol(const SVal &Val) {
-  if (const auto Reg = Val.getAsRegion()) {
-    return Reg;
-  } else if (const auto Sym = Val.getAsSymbol()) {
-    return Sym;
-  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
-    return LCVal->getRegion();
-  }
-  return RegionOrSymbol();
-}
-
-const ProgramStateRef processComparison(ProgramStateRef State,
-                                        RegionOrSymbol LVal,
-                                        RegionOrSymbol RVal, bool Equal) {
-  const auto *LPos = getIteratorPosition(State, LVal);
-  const auto *RPos = getIteratorPosition(State, RVal);
-  if (LPos && !RPos) {
-    State = adjustIteratorPosition(State, RVal, *LPos, Equal);
-  } else if (!LPos && RPos) {
-    State = adjustIteratorPosition(State, LVal, *RPos, Equal);
-  } else if (LPos && RPos) {
-    if (contradictingIteratorPositions(*LPos, *RPos, Equal)) {
-      return nullptr;
-    }
-  }
-  return State;
-}
-
-const ProgramStateRef saveComparison(ProgramStateRef State,
-                                     const SymExpr *Condition, const SVal &LVal,
-                                     const SVal &RVal, bool Eq) {
-  const auto Left = getRegionOrSymbol(LVal);
-  const auto Right = getRegionOrSymbol(RVal);
-  if (!Left || !Right)
-    return State;
-  return State->set<IteratorComparisonMap>(Condition,
-                                           IteratorComparison(Left, Right, Eq));
-}
-
-const IteratorComparison *loadComparison(ProgramStateRef State,
-                                         const SymExpr *Condition) {
-  return State->get<IteratorComparisonMap>(Condition);
-}
-
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            const SVal &Val) {
-  if (const auto Reg = Val.getAsRegion()) {
-    return State->get<IteratorRegionMap>(Reg);
-  } else if (const auto Sym = Val.getAsSymbol()) {
-    return State->get<IteratorSymbolMap>(Sym);
-  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
-    return State->get<IteratorRegionMap>(LCVal->getRegion());
-  }
-  return nullptr;
-}
-
-const IteratorPosition *getIteratorPosition(ProgramStateRef State,
-                                            RegionOrSymbol RegOrSym) {
-  if (RegOrSym.is<const MemRegion *>()) {
-    return State->get<IteratorRegionMap>(RegOrSym.get<const MemRegion *>());
-  } else if (RegOrSym.is<SymbolRef>()) {
-    return State->get<IteratorSymbolMap>(RegOrSym.get<SymbolRef>());
-  }
-  return nullptr;
-}
-
-ProgramStateRef setIteratorPosition(ProgramStateRef State, const SVal &Val,
-                                    IteratorPosition Pos) {
-  if (const auto Reg = Val.getAsRegion()) {
-    return State->set<IteratorRegionMap>(Reg, Pos);
-  } else if (const auto Sym = Val.getAsSymbol()) {
-    return State->set<IteratorSymbolMap>(Sym, Pos);
-  } else if (const auto LCVal = Val.getAs<nonloc::LazyCompoundVal>()) {
-    return State->set<IteratorRegionMap>(LCVal->getRegion(), Pos);
-  }
-  return nullptr;
-}
-
-ProgramStateRef setIteratorPosition(ProgramStateRef State,
-                                    RegionOrSymbol RegOrSym,
-                                    IteratorPosition Pos) {
-  if (RegOrSym.is<const MemRegion *>()) {
-    return State->set<IteratorRegionMap>(RegOrSym.get<const MemRegion *>(),
-                                         Pos);
-  } else if (RegOrSym.is<SymbolRef>()) {
-    return State->set<IteratorSymbolMap>(RegOrSym.get<SymbolRef>(), Pos);
-  }
-  return nullptr;
-}
-
-ProgramStateRef adjustIteratorPosition(ProgramStateRef State,
-                                       RegionOrSymbol RegOrSym,
-                                       IteratorPosition Pos, bool Equal) {
-
-  if ((Pos.isInRange() && Equal) || (Pos.isOutofRange() && !Equal)) {
-    return setIteratorPosition(State, RegOrSym, IteratorPosition::getInRange());
-  } else if (Pos.isOutofRange() && Equal) {
-    return setIteratorPosition(State, RegOrSym,
-                               IteratorPosition::getOutofRange());
-  } else {
-    return State;
-  }
-}
-
-bool contradictingIteratorPositions(IteratorPosition Pos1,
-                                    IteratorPosition Pos2, bool Equal) {
-  return ((Pos1 != Pos2) && Equal) ||
-         ((Pos1.isOutofRange() && Pos2.isOutofRange()) && !Equal);
-}
-}
-
-void ento::registerIteratorPastEndChecker(CheckerManager &Mgr) {
-  Mgr.registerChecker<IteratorPastEndChecker>();
-}
diff --git a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
index 7ef79c683c49..0e3a649e88f7 100644
--- a/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
+++ b/lib/StaticAnalyzer/Checkers/PthreadLockChecker.cpp
@@ -25,7 +25,13 @@ using namespace ento;
 namespace {
 
 struct LockState {
-  enum Kind { Destroyed, Locked, Unlocked } K;
+  enum Kind {
+    Destroyed,
+    Locked,
+    Unlocked,
+    UntouchedAndPossiblyDestroyed,
+    UnlockedAndPossiblyDestroyed
+  } K;
 
 private:
   LockState(Kind K) : K(K) {}
@@ -34,6 +40,12 @@ struct LockState {
   static LockState getLocked() { return LockState(Locked); }
   static LockState getUnlocked() { return LockState(Unlocked); }
   static LockState getDestroyed() { return LockState(Destroyed); }
+  static LockState getUntouchedAndPossiblyDestroyed() {
+    return LockState(UntouchedAndPossiblyDestroyed);
+  }
+  static LockState getUnlockedAndPossiblyDestroyed() {
+    return LockState(UnlockedAndPossiblyDestroyed);
+  }
 
   bool operator==(const LockState &X) const {
     return K == X.K;
@@ -42,13 +54,20 @@ struct LockState {
   bool isLocked() const { return K == Locked; }
   bool isUnlocked() const { return K == Unlocked; }
   bool isDestroyed() const { return K == Destroyed; }
+  bool isUntouchedAndPossiblyDestroyed() const {
+    return K == UntouchedAndPossiblyDestroyed;
+  }
+  bool isUnlockedAndPossiblyDestroyed() const {
+    return K == UnlockedAndPossiblyDestroyed;
+  }
 
   void Profile(llvm::FoldingSetNodeID &ID) const {
     ID.AddInteger(K);
   }
 };
 
-class PthreadLockChecker : public Checker< check::PostStmt<CallExpr> > {
+class PthreadLockChecker
+    : public Checker<check::PostStmt<CallExpr>, check::DeadSymbols> {
   mutable std::unique_ptr<BugType> BT_doublelock;
   mutable std::unique_ptr<BugType> BT_doubleunlock;
   mutable std::unique_ptr<BugType> BT_destroylock;
@@ -61,22 +80,31 @@ class PthreadLockChecker : public Checker< check::PostStmt<CallExpr> > {
   };
 public:
   void checkPostStmt(const CallExpr *CE, CheckerContext &C) const;
+  void checkDeadSymbols(SymbolReaper &SymReaper, CheckerContext &C) const;
 
   void AcquireLock(CheckerContext &C, const CallExpr *CE, SVal lock,
                    bool isTryLock, enum LockingSemantics semantics) const;
 
   void ReleaseLock(CheckerContext &C, const CallExpr *CE, SVal lock) const;
-  void DestroyLock(CheckerContext &C, const CallExpr *CE, SVal Lock) const;
+  void DestroyLock(CheckerContext &C, const CallExpr *CE, SVal Lock,
+                   enum LockingSemantics semantics) const;
   void InitLock(CheckerContext &C, const CallExpr *CE, SVal Lock) const;
   void reportUseDestroyedBug(CheckerContext &C, const CallExpr *CE) const;
+  ProgramStateRef resolvePossiblyDestroyedMutex(ProgramStateRef state,
+                                                const MemRegion *lockR,
+                                                const SymbolRef *sym) const;
 };
 } // end anonymous namespace
 
-// GDM Entry for tracking lock state.
+// A stack of locks for tracking lock-unlock order.
 REGISTER_LIST_WITH_PROGRAMSTATE(LockSet, const MemRegion *)
 
+// An entry for tracking lock states.
 REGISTER_MAP_WITH_PROGRAMSTATE(LockMap, const MemRegion *, LockState)
 
+// Return values for unresolved calls to pthread_mutex_destroy().
+REGISTER_MAP_WITH_PROGRAMSTATE(DestroyRetVal, const MemRegion *, SymbolRef)
+
 void PthreadLockChecker::checkPostStmt(const CallExpr *CE,
                                        CheckerContext &C) const {
   ProgramStateRef state = C.getState();
@@ -113,13 +141,49 @@ void PthreadLockChecker::checkPostStmt(const CallExpr *CE,
            FName == "lck_mtx_unlock" ||
            FName == "lck_rw_done")
     ReleaseLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
-  else if (FName == "pthread_mutex_destroy" ||
-           FName == "lck_mtx_destroy")
-    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
+  else if (FName == "pthread_mutex_destroy")
+    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx), PthreadSemantics);
+  else if (FName == "lck_mtx_destroy")
+    DestroyLock(C, CE, state->getSVal(CE->getArg(0), LCtx), XNUSemantics);
   else if (FName == "pthread_mutex_init")
     InitLock(C, CE, state->getSVal(CE->getArg(0), LCtx));
 }
 
+// When a lock is destroyed, in some semantics(like PthreadSemantics) we are not
+// sure if the destroy call has succeeded or failed, and the lock enters one of
+// the 'possibly destroyed' state. There is a short time frame for the
+// programmer to check the return value to see if the lock was successfully
+// destroyed. Before we model the next operation over that lock, we call this
+// function to see if the return value was checked by now and set the lock state
+// - either to destroyed state or back to its previous state.
+
+// In PthreadSemantics, pthread_mutex_destroy() returns zero if the lock is
+// successfully destroyed and it returns a non-zero value otherwise.
+ProgramStateRef PthreadLockChecker::resolvePossiblyDestroyedMutex(
+    ProgramStateRef state, const MemRegion *lockR, const SymbolRef *sym) const {
+  const LockState *lstate = state->get<LockMap>(lockR);
+  // Existence in DestroyRetVal ensures existence in LockMap.
+  // Existence in Destroyed also ensures that the lock state for lockR is either
+  // UntouchedAndPossiblyDestroyed or UnlockedAndPossiblyDestroyed.
+  assert(lstate->isUntouchedAndPossiblyDestroyed() ||
+         lstate->isUnlockedAndPossiblyDestroyed());
+
+  ConstraintManager &CMgr = state->getConstraintManager();
+  ConditionTruthVal retZero = CMgr.isNull(state, *sym);
+  if (retZero.isConstrainedFalse()) {
+    if (lstate->isUntouchedAndPossiblyDestroyed())
+      state = state->remove<LockMap>(lockR);
+    else if (lstate->isUnlockedAndPossiblyDestroyed())
+      state = state->set<LockMap>(lockR, LockState::getUnlocked());
+  } else
+    state = state->set<LockMap>(lockR, LockState::getDestroyed());
+
+  // Removing the map entry (lockR, sym) from DestroyRetVal as the lock state is
+  // now resolved.
+  state = state->remove<DestroyRetVal>(lockR);
+  return state;
+}
+
 void PthreadLockChecker::AcquireLock(CheckerContext &C, const CallExpr *CE,
                                      SVal lock, bool isTryLock,
                                      enum LockingSemantics semantics) const {
@@ -129,6 +193,9 @@ void PthreadLockChecker::AcquireLock(CheckerContext &C, const CallExpr *CE,
     return;
 
   ProgramStateRef state = C.getState();
+  const SymbolRef *sym = state->get<DestroyRetVal>(lockR);
+  if (sym)
+    state = resolvePossiblyDestroyedMutex(state, lockR, sym);
 
   SVal X = state->getSVal(CE, C.getLocationContext());
   if (X.isUnknownOrUndef())
@@ -197,6 +264,9 @@ void PthreadLockChecker::ReleaseLock(CheckerContext &C, const CallExpr *CE,
     return;
 
   ProgramStateRef state = C.getState();
+  const SymbolRef *sym = state->get<DestroyRetVal>(lockR);
+  if (sym)
+    state = resolvePossiblyDestroyedMutex(state, lockR, sym);
 
   if (const LockState *LState = state->get<LockMap>(lockR)) {
     if (LState->isUnlocked()) {
@@ -245,7 +315,8 @@ void PthreadLockChecker::ReleaseLock(CheckerContext &C, const CallExpr *CE,
 }
 
 void PthreadLockChecker::DestroyLock(CheckerContext &C, const CallExpr *CE,
-                                     SVal Lock) const {
+                                     SVal Lock,
+                                     enum LockingSemantics semantics) const {
 
   const MemRegion *LockR = Lock.getAsRegion();
   if (!LockR)
@@ -253,13 +324,38 @@ void PthreadLockChecker::DestroyLock(CheckerContext &C, const CallExpr *CE,
 
   ProgramStateRef State = C.getState();
 
-  const LockState *LState = State->get<LockMap>(LockR);
-  if (!LState || LState->isUnlocked()) {
-    State = State->set<LockMap>(LockR, LockState::getDestroyed());
-    C.addTransition(State);
-    return;
-  }
+  const SymbolRef *sym = State->get<DestroyRetVal>(LockR);
+  if (sym)
+    State = resolvePossiblyDestroyedMutex(State, LockR, sym);
 
+  const LockState *LState = State->get<LockMap>(LockR);
+  // Checking the return value of the destroy method only in the case of
+  // PthreadSemantics
+  if (semantics == PthreadSemantics) {
+    if (!LState || LState->isUnlocked()) {
+      SymbolRef sym = C.getSVal(CE).getAsSymbol();
+      if (!sym) {
+        State = State->remove<LockMap>(LockR);
+        C.addTransition(State);
+        return;
+      }
+      State = State->set<DestroyRetVal>(LockR, sym);
+      if (LState && LState->isUnlocked())
+        State = State->set<LockMap>(
+            LockR, LockState::getUnlockedAndPossiblyDestroyed());
+      else
+        State = State->set<LockMap>(
+            LockR, LockState::getUntouchedAndPossiblyDestroyed());
+      C.addTransition(State);
+      return;
+    }
+  } else {
+    if (!LState || LState->isUnlocked()) {
+      State = State->set<LockMap>(LockR, LockState::getDestroyed());
+      C.addTransition(State);
+      return;
+    }
+  }
   StringRef Message;
 
   if (LState->isLocked()) {
@@ -288,6 +384,10 @@ void PthreadLockChecker::InitLock(CheckerContext &C, const CallExpr *CE,
 
   ProgramStateRef State = C.getState();
 
+  const SymbolRef *sym = State->get<DestroyRetVal>(LockR);
+  if (sym)
+    State = resolvePossiblyDestroyedMutex(State, LockR, sym);
+
   const struct LockState *LState = State->get<LockMap>(LockR);
   if (!LState || LState->isDestroyed()) {
     State = State->set<LockMap>(LockR, LockState::getUnlocked());
@@ -328,6 +428,26 @@ void PthreadLockChecker::reportUseDestroyedBug(CheckerContext &C,
   C.emitReport(std::move(Report));
 }
 
+void PthreadLockChecker::checkDeadSymbols(SymbolReaper &SymReaper,
+                                          CheckerContext &C) const {
+  ProgramStateRef State = C.getState();
+
+  // TODO: Clean LockMap when a mutex region dies.
+
+  DestroyRetValTy TrackedSymbols = State->get<DestroyRetVal>();
+  for (DestroyRetValTy::iterator I = TrackedSymbols.begin(),
+                                 E = TrackedSymbols.end();
+       I != E; ++I) {
+    const SymbolRef Sym = I->second;
+    const MemRegion *lockR = I->first;
+    bool IsSymDead = SymReaper.isDead(Sym);
+    // Remove the dead symbol from the return value symbols map.
+    if (IsSymDead)
+      State = resolvePossiblyDestroyedMutex(State, lockR, &Sym);
+  }
+  C.addTransition(State);
+}
+
 void ento::registerPthreadLockChecker(CheckerManager &mgr) {
   mgr.registerChecker<PthreadLockChecker>();
 }
diff --git a/lib/StaticAnalyzer/Core/ProgramState.cpp b/lib/StaticAnalyzer/Core/ProgramState.cpp
index 31556c792fc5..3215c3ccd21e 100644
--- a/lib/StaticAnalyzer/Core/ProgramState.cpp
+++ b/lib/StaticAnalyzer/Core/ProgramState.cpp
@@ -644,15 +644,33 @@ ProgramStateRef ProgramState::addTaint(const Stmt *S,
   if (const Expr *E = dyn_cast_or_null<Expr>(S))
     S = E->IgnoreParens();
 
-  SymbolRef Sym = getSVal(S, LCtx).getAsSymbol();
+  return addTaint(getSVal(S, LCtx), Kind);
+}
+
+ProgramStateRef ProgramState::addTaint(SVal V,
+                                       TaintTagType Kind) const {
+  SymbolRef Sym = V.getAsSymbol();
   if (Sym)
     return addTaint(Sym, Kind);
 
-  const MemRegion *R = getSVal(S, LCtx).getAsRegion();
-  addTaint(R, Kind);
+  // If the SVal represents a structure, try to mass-taint all values within the
+  // structure. For now it only works efficiently on lazy compound values that
+  // were conjured during a conservative evaluation of a function - either as
+  // return values of functions that return structures or arrays by value, or as
+  // values of structures or arrays passed into the function by reference,
+  // directly or through pointer aliasing. Such lazy compound values are
+  // characterized by having exactly one binding in their captured store within
+  // their parent region, which is a conjured symbol default-bound to the base
+  // region of the parent region.
+  if (auto LCV = V.getAs<nonloc::LazyCompoundVal>()) {
+    if (Optional<SVal> binding = getStateManager().StoreMgr->getDefaultBinding(*LCV)) {
+      if (SymbolRef Sym = binding->getAsSymbol())
+        return addPartialTaint(Sym, LCV->getRegion(), Kind);
+    }
+  }
 
-  // Cannot add taint, so just return the state.
-  return this;
+  const MemRegion *R = V.getAsRegion();
+  return addTaint(R, Kind);
 }
 
 ProgramStateRef ProgramState::addTaint(const MemRegion *R,
@@ -674,6 +692,27 @@ ProgramStateRef ProgramState::addTaint(SymbolRef Sym,
   return NewState;
 }
 
+ProgramStateRef ProgramState::addPartialTaint(SymbolRef ParentSym,
+                                              const SubRegion *SubRegion,
+                                              TaintTagType Kind) const {
+  // Ignore partial taint if the entire parent symbol is already tainted.
+  if (contains<TaintMap>(ParentSym) && *get<TaintMap>(ParentSym) == Kind)
+    return this;
+
+  // Partial taint applies if only a portion of the symbol is tainted.
+  if (SubRegion == SubRegion->getBaseRegion())
+    return addTaint(ParentSym, Kind);
+
+  const TaintedSubRegions *SavedRegs = get<DerivedSymTaint>(ParentSym);
+  TaintedSubRegions Regs =
+      SavedRegs ? *SavedRegs : stateMgr->TSRFactory.getEmptyMap();
+
+  Regs = stateMgr->TSRFactory.add(Regs, SubRegion, Kind);
+  ProgramStateRef NewState = set<DerivedSymTaint>(ParentSym, Regs);
+  assert(NewState);
+  return NewState;
+}
+
 bool ProgramState::isTainted(const Stmt *S, const LocationContext *LCtx,
                              TaintTagType Kind) const {
   if (const Expr *E = dyn_cast_or_null<Expr>(S))
@@ -714,31 +753,52 @@ bool ProgramState::isTainted(SymbolRef Sym, TaintTagType Kind) const {
     return false;
 
   // Traverse all the symbols this symbol depends on to see if any are tainted.
-  bool Tainted = false;
   for (SymExpr::symbol_iterator SI = Sym->symbol_begin(), SE =Sym->symbol_end();
        SI != SE; ++SI) {
     if (!isa<SymbolData>(*SI))
       continue;
 
-    const TaintTagType *Tag = get<TaintMap>(*SI);
-    Tainted = (Tag && *Tag == Kind);
+    if (const TaintTagType *Tag = get<TaintMap>(*SI)) {
+      if (*Tag == Kind)
+        return true;
+    }
 
-    // If this is a SymbolDerived with a tainted parent, it's also tainted.
-    if (const SymbolDerived *SD = dyn_cast<SymbolDerived>(*SI))
-      Tainted = Tainted || isTainted(SD->getParentSymbol(), Kind);
+    if (const SymbolDerived *SD = dyn_cast<SymbolDerived>(*SI)) {
+      // If this is a SymbolDerived with a tainted parent, it's also tainted.
+      if (isTainted(SD->getParentSymbol(), Kind))
+        return true;
+
+      // If this is a SymbolDerived with the same parent symbol as another
+      // tainted SymbolDerived and a region that's a sub-region of that tainted
+      // symbol, it's also tainted.
+      if (const TaintedSubRegions *Regs =
+              get<DerivedSymTaint>(SD->getParentSymbol())) {
+        const TypedValueRegion *R = SD->getRegion();
+        for (auto I : *Regs) {
+          // FIXME: The logic to identify tainted regions could be more
+          // complete. For example, this would not currently identify
+          // overlapping fields in a union as tainted. To identify this we can
+          // check for overlapping/nested byte offsets.
+          if (Kind == I.second &&
+              (R == I.first || R->isSubRegionOf(I.first)))
+            return true;
+        }
+      }
+    }
 
     // If memory region is tainted, data is also tainted.
-    if (const SymbolRegionValue *SRV = dyn_cast<SymbolRegionValue>(*SI))
-      Tainted = Tainted || isTainted(SRV->getRegion(), Kind);
+    if (const SymbolRegionValue *SRV = dyn_cast<SymbolRegionValue>(*SI)) {
+      if (isTainted(SRV->getRegion(), Kind))
+        return true;
+    }
 
-    // If If this is a SymbolCast from a tainted value, it's also tainted.
-    if (const SymbolCast *SC = dyn_cast<SymbolCast>(*SI))
-      Tainted = Tainted || isTainted(SC->getOperand(), Kind);
-
-    if (Tainted)
-      return true;
+    // If this is a SymbolCast from a tainted value, it's also tainted.
+    if (const SymbolCast *SC = dyn_cast<SymbolCast>(*SI)) {
+      if (isTainted(SC->getOperand(), Kind))
+        return true;
+    }
   }
 
-  return Tainted;
+  return false;
 }
 
diff --git a/lib/StaticAnalyzer/Core/RegionStore.cpp b/lib/StaticAnalyzer/Core/RegionStore.cpp
index 3000e13d32c6..28f78fa3ff5e 100644
--- a/lib/StaticAnalyzer/Core/RegionStore.cpp
+++ b/lib/StaticAnalyzer/Core/RegionStore.cpp
@@ -496,7 +496,10 @@ class RegionStoreManager : public StoreManager {
 
   Optional<SVal> getDefaultBinding(Store S, const MemRegion *R) override {
     RegionBindingsRef B = getRegionBindings(S);
-    return B.getDefaultBinding(R);
+    // Default bindings are always applied over a base region so look up the
+    // base region's default binding, otherwise the lookup will fail when R
+    // is at an offset from R->getBaseRegion().
+    return B.getDefaultBinding(R->getBaseRegion());
   }
 
   SVal getBinding(RegionBindingsConstRef B, Loc L, QualType T = QualType());
diff --git a/test/Analysis/Inputs/system-header-simulator-cxx.h b/test/Analysis/Inputs/system-header-simulator-cxx.h
index 005e7f57af6f..809b768d71e0 100644
--- a/test/Analysis/Inputs/system-header-simulator-cxx.h
+++ b/test/Analysis/Inputs/system-header-simulator-cxx.h
@@ -8,18 +8,61 @@
 typedef unsigned char uint8_t;
 
 typedef __typeof__(sizeof(int)) size_t;
+typedef __typeof__((char*)0-(char*)0) ptrdiff_t;
 void *memmove(void *s1, const void *s2, size_t n);
 
-template <typename T, typename Ptr, typename Ref> struct __iterator {
-  typedef __iterator<T, T *, T &> iterator;
-  typedef __iterator<T, const T *, const T &> const_iterator;
+namespace std {
+  struct input_iterator_tag { };
+  struct output_iterator_tag { };
+  struct forward_iterator_tag : public input_iterator_tag { };
+  struct bidirectional_iterator_tag : public forward_iterator_tag { };
+  struct random_access_iterator_tag : public bidirectional_iterator_tag { };
 
-  __iterator(const Ptr p) : ptr(p) {}
+  template <typename Iterator> struct iterator_traits {
+    typedef typename Iterator::difference_type difference_type;
+    typedef typename Iterator::value_type value_type;
+    typedef typename Iterator::pointer pointer;
+    typedef typename Iterator::reference reference;
+    typedef typename Iterator::iterator_category iterator_category;
+  };
+}
+
+template <typename T, typename Ptr, typename Ref> struct __vector_iterator {
+  typedef __vector_iterator<T, T *, T &> iterator;
+  typedef __vector_iterator<T, const T *, const T &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  __vector_iterator(const Ptr p = 0) : ptr(p) {}
+  __vector_iterator(const iterator &rhs): ptr(rhs.base()) {}
+  __vector_iterator<T, Ptr, Ref> operator++() { ++ ptr; return *this; }
+  __vector_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    ++ ptr;
+    return tmp;
+  }
+  __vector_iterator<T, Ptr, Ref> operator--() { -- ptr; return *this; }
+  __vector_iterator<T, Ptr, Ref> operator--(int) {
+    auto tmp = *this; -- ptr;
+    return tmp;
+  }
+  __vector_iterator<T, Ptr, Ref> operator+(difference_type n) {
+    return ptr + n;
+  }
+  __vector_iterator<T, Ptr, Ref> operator-(difference_type n) {
+    return ptr - n;
+  }
+  __vector_iterator<T, Ptr, Ref> operator+=(difference_type n) {
+    return ptr += n;
+  }
+  __vector_iterator<T, Ptr, Ref> operator-=(difference_type n) {
+    return ptr -= n;
+  }
 
-  __iterator<T, Ptr, Ref> operator++() { return *this; }
-  __iterator<T, Ptr, Ref> operator++(int) { return *this; }
-  __iterator<T, Ptr, Ref> operator--() { return *this; }
-  __iterator<T, Ptr, Ref> operator--(int) { return *this; }
   Ref operator*() const { return *ptr; }
   Ptr operator->() const { return *ptr; }
 
@@ -29,10 +72,136 @@ template <typename T, typename Ptr, typename Ref> struct __iterator {
   bool operator!=(const iterator &rhs) const { return ptr != rhs.ptr; }
   bool operator!=(const const_iterator &rhs) const { return ptr != rhs.ptr; }
 
+  const Ptr& base() const { return ptr; }
+
 private:
   Ptr ptr;
 };
 
+template <typename T, typename Ptr, typename Ref> struct __deque_iterator {
+  typedef __deque_iterator<T, T *, T &> iterator;
+  typedef __deque_iterator<T, const T *, const T &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::random_access_iterator_tag iterator_category;
+
+  __deque_iterator(const Ptr p = 0) : ptr(p) {}
+  __deque_iterator(const iterator &rhs): ptr(rhs.base()) {}
+  __deque_iterator<T, Ptr, Ref> operator++() { ++ ptr; return *this; }
+  __deque_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    ++ ptr;
+    return tmp;
+  }
+  __deque_iterator<T, Ptr, Ref> operator--() { -- ptr; return *this; }
+  __deque_iterator<T, Ptr, Ref> operator--(int) {
+    auto tmp = *this; -- ptr;
+    return tmp;
+  }
+  __deque_iterator<T, Ptr, Ref> operator+(difference_type n) {
+    return ptr + n;
+  }
+  __deque_iterator<T, Ptr, Ref> operator-(difference_type n) {
+    return ptr - n;
+  }
+  __deque_iterator<T, Ptr, Ref> operator+=(difference_type n) {
+    return ptr += n;
+  }
+  __deque_iterator<T, Ptr, Ref> operator-=(difference_type n) {
+    return ptr -= n;
+  }
+
+  Ref operator*() const { return *ptr; }
+  Ptr operator->() const { return *ptr; }
+
+  bool operator==(const iterator &rhs) const { return ptr == rhs.ptr; }
+  bool operator==(const const_iterator &rhs) const { return ptr == rhs.ptr; }
+
+  bool operator!=(const iterator &rhs) const { return ptr != rhs.ptr; }
+  bool operator!=(const const_iterator &rhs) const { return ptr != rhs.ptr; }
+
+  const Ptr& base() const { return ptr; }
+
+private:
+  Ptr ptr;
+};
+
+template <typename T, typename Ptr, typename Ref> struct __list_iterator {
+  typedef __list_iterator<T, __typeof__(T::data) *, __typeof__(T::data) &> iterator;
+  typedef __list_iterator<T, const __typeof__(T::data) *, const __typeof__(T::data) &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::bidirectional_iterator_tag iterator_category;
+
+  __list_iterator(T* it = 0) : item(it) {}
+  __list_iterator(const iterator &rhs): item(rhs.base()) {}
+  __list_iterator<T, Ptr, Ref> operator++() { item = item->next; return *this; }
+  __list_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    item = item->next;
+    return tmp;
+  }
+  __list_iterator<T, Ptr, Ref> operator--() { item = item->prev; return *this; }
+  __list_iterator<T, Ptr, Ref> operator--(int) {
+    auto tmp = *this;
+    item = item->prev;
+    return tmp;
+  }
+
+  Ref operator*() const { return item->data; }
+  Ptr operator->() const { return item->data; }
+
+  bool operator==(const iterator &rhs) const { return item == rhs->item; }
+  bool operator==(const const_iterator &rhs) const { return item == rhs->item; }
+
+  bool operator!=(const iterator &rhs) const { return item != rhs->item; }
+  bool operator!=(const const_iterator &rhs) const { return item != rhs->item; }
+
+  const T* &base() const { return item; }
+
+private:
+  T* item;
+};
+
+template <typename T, typename Ptr, typename Ref> struct __fwdl_iterator {
+  typedef __fwdl_iterator<T, __typeof__(T::data) *, __typeof__(T::data) &> iterator;
+  typedef __fwdl_iterator<T, const __typeof__(T::data) *, const __typeof__(T::data) &> const_iterator;
+
+  typedef ptrdiff_t difference_type;
+  typedef T value_type;
+  typedef Ptr pointer;
+  typedef Ref reference;
+  typedef std::forward_iterator_tag iterator_category;
+
+  __fwdl_iterator(T* it = 0) : item(it) {}
+  __fwdl_iterator(const iterator &rhs): item(rhs.base()) {}
+  __fwdl_iterator<T, Ptr, Ref> operator++() { item = item->next; return *this; }
+  __fwdl_iterator<T, Ptr, Ref> operator++(int) {
+    auto tmp = *this;
+    item = item->next;
+    return tmp;
+  }
+  Ref operator*() const { return item->data; }
+  Ptr operator->() const { return item->data; }
+
+  bool operator==(const iterator &rhs) const { return item == rhs->item; }
+  bool operator==(const const_iterator &rhs) const { return item == rhs->item; }
+
+  bool operator!=(const iterator &rhs) const { return item != rhs->item; }
+  bool operator!=(const const_iterator &rhs) const { return item != rhs->item; }
+
+  const T* &base() const { return item; }
+
+private:
+  T* item;
+};
+
 namespace std {
   template <class T1, class T2>
   struct pair {
@@ -43,29 +212,45 @@ namespace std {
     pair(const T1 &a, const T2 &b) : first(a), second(b) {}
     
     template<class U1, class U2>
-    pair(const pair<U1, U2> &other) : first(other.first), second(other.second) {}
+    pair(const pair<U1, U2> &other) : first(other.first),
+                                      second(other.second) {}
   };
   
   typedef __typeof__(sizeof(int)) size_t;
+
+  template <class T> class initializer_list;
   
+  template< class T > struct remove_reference      {typedef T type;};
+  template< class T > struct remove_reference<T&>  {typedef T type;};
+  template< class T > struct remove_reference<T&&> {typedef T type;};
+
+  template<class T> 
+  typename remove_reference<T>::type&& move(T&& a) {
+    typedef typename remove_reference<T>::type&& RvalRef;
+    return static_cast<RvalRef>(a);
+  }
+
   template<typename T>
   class vector {
-    typedef __iterator<T, T *, T &> iterator;
-    typedef __iterator<T, const T *, const T &> const_iterator;
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __vector_iterator<T, T *, T &> iterator;
+    typedef __vector_iterator<T, const T *, const T &> const_iterator;
 
     T *_start;
     T *_finish;
     T *_end_of_storage;
   public:
     vector() : _start(0), _finish(0), _end_of_storage(0) {}
+    template <typename InputIterator>
+    vector(InputIterator first, InputIterator last);
+    vector(const vector &other);
+    vector(vector &&other);
     ~vector();
     
     size_t size() const {
       return size_t(_finish - _start);
     }
-    
-    void push_back();
-    T pop_back();
 
     T &operator[](size_t n) {
       return _start[n];
@@ -77,10 +262,124 @@ namespace std {
     
     iterator begin() { return iterator(_start); }
     const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
     iterator end() { return iterator(_finish); }
     const_iterator end() const { return const_iterator(_finish); }
+    const_iterator cend() const { return const_iterator(_finish); }
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+    T& back() { return *(end() - 1); }
+    const T& back() const { return *(end() - 1); }
   };
   
+  template<typename T>
+  class list {
+    struct __item {
+      T data;
+      __item *prev, *next;
+    } *_start, *_finish;
+  public:
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __list_iterator<__item, T *, T &> iterator;
+    typedef __list_iterator<__item, const T *, const T &> const_iterator;
+
+    list() : _start(0), _finish(0) {}
+    template <typename InputIterator>
+    list(InputIterator first, InputIterator last);
+    list(const list &other);
+    list(list &&other);
+    ~list();
+    
+    list& operator=(const list &other);
+    list& operator=(list &&other);
+    list& operator=(std::initializer_list<T> ilist);
+
+    iterator begin() { return iterator(_start); }
+    const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
+    iterator end() { return iterator(_finish); }
+    const_iterator end() const { return const_iterator(_finish); }
+    const_iterator cend() const { return const_iterator(_finish); }
+
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+    T& back() { return *--end(); }
+    const T& back() const { return *--end(); }
+  };
+
+  template<typename T>
+  class deque {
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __deque_iterator<T, T *, T &> iterator;
+    typedef __deque_iterator<T, const T *, const T &> const_iterator;
+
+    T *_start;
+    T *_finish;
+    T *_end_of_storage;
+  public:
+    deque() : _start(0), _finish(0), _end_of_storage(0) {}
+    template <typename InputIterator>
+    deque(InputIterator first, InputIterator last);
+    deque(const deque &other);
+    deque(deque &&other);
+    ~deque();
+    
+    size_t size() const {
+      return size_t(_finish - _start);
+    }
+    
+    T &operator[](size_t n) {
+      return _start[n];
+    }
+    
+    const T &operator[](size_t n) const {
+      return _start[n];
+    }
+    
+    iterator begin() { return iterator(_start); }
+    const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
+    iterator end() { return iterator(_finish); }
+    const_iterator end() const { return const_iterator(_finish); }
+    const_iterator cend() const { return const_iterator(_finish); }
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+    T& back() { return *(end() - 1); }
+    const T& back() const { return *(end() - 1); }
+  };
+  
+  template<typename T>
+  class forward_list {
+    struct __item {
+      T data;
+      __item *next;
+    } *_start;
+  public:
+    typedef T value_type;
+    typedef size_t size_type;
+    typedef __fwdl_iterator<__item, T *, T &> iterator;
+    typedef __fwdl_iterator<__item, const T *, const T &> const_iterator;
+
+    forward_list() : _start(0) {}
+    template <typename InputIterator>
+    forward_list(InputIterator first, InputIterator last);
+    forward_list(const forward_list &other);
+    forward_list(forward_list &&other);
+    ~forward_list();
+    
+    iterator begin() { return iterator(_start); }
+    const_iterator begin() const { return const_iterator(_start); }
+    const_iterator cbegin() const { return const_iterator(_start); }
+    iterator end() { return iterator(); }
+    const_iterator end() const { return const_iterator(); }
+    const_iterator cend() const { return const_iterator(); }
+
+    T& front() { return *begin(); }
+    const T& front() const { return *begin(); }
+  };
+
   class exception {
   public:
     exception() throw();
@@ -247,41 +546,41 @@ namespace std {
   OutputIter copy_backward(InputIter II, InputIter IE, OutputIter OI) {
     return __copy_backward(II, IE, OI);
   }
+}
+
+template <class BidirectionalIterator, class Distance>
+void __advance (BidirectionalIterator& it, Distance n,
+                std::bidirectional_iterator_tag) {
+  if (n >= 0) while(n-- > 0) ++it; else while (n++<0) --it;
+}
+
+template <class RandomAccessIterator, class Distance>
+void __advance (RandomAccessIterator& it, Distance n,
+                std::random_access_iterator_tag) {
+  it += n;
+}
+
+namespace std {
+  template <class InputIterator, class Distance>
+  void advance (InputIterator& it, Distance n) {
+    __advance(it, n, typename InputIterator::iterator_category());
+  }
+
+  template <class BidirectionalIterator>
+  BidirectionalIterator
+  prev (BidirectionalIterator it,
+        typename iterator_traits<BidirectionalIterator>::difference_type n =
+        1) {
+    advance(it, -n);
+    return it;
+  }
 
   template <class InputIterator, class T>
   InputIterator find(InputIterator first, InputIterator last, const T &val);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 find_end(ForwardIterator1 first1, ForwardIterator1 last1,
-                            ForwardIterator2 first2, ForwardIterator2 last2);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 find_first_of(ForwardIterator1 first1,
-                                 ForwardIterator1 last1,
-                                 ForwardIterator2 first2,
-                                 ForwardIterator2 last2);
-  template <class InputIterator, class UnaryPredicate>
-  InputIterator find_if(InputIterator first, InputIterator last,
-                        UnaryPredicate pred);
-  template <class InputIterator, class UnaryPredicate>
-  InputIterator find_if_not(InputIterator first, InputIterator last,
-                            UnaryPredicate pred);
-  template <class InputIterator, class T>
-  InputIterator lower_bound(InputIterator first, InputIterator last,
-                            const T &val);
-  template <class InputIterator, class T>
-  InputIterator upper_bound(InputIterator first, InputIterator last,
-                            const T &val);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 search(ForwardIterator1 first1, ForwardIterator1 last1,
-                          ForwardIterator2 first2, ForwardIterator2 last2);
-  template <class ForwardIterator1, class ForwardIterator2>
-  ForwardIterator1 search_n(ForwardIterator1 first1, ForwardIterator1 last1,
-                            ForwardIterator2 first2, ForwardIterator2 last2);
 
-  struct input_iterator_tag { };
-  struct output_iterator_tag { };
-  struct forward_iterator_tag : public input_iterator_tag { };
-  struct bidirectional_iterator_tag : public forward_iterator_tag { };
-  struct random_access_iterator_tag : public bidirectional_iterator_tag { };
+  template <class InputIterator, class OutputIterator>
+  OutputIterator copy(InputIterator first, InputIterator last,
+                      OutputIterator result);
 
 }
 
diff --git a/test/Analysis/diagnostics/explicit-suppression.cpp b/test/Analysis/diagnostics/explicit-suppression.cpp
index 193846c082bc..96d2b4a7d095 100644
--- a/test/Analysis/diagnostics/explicit-suppression.cpp
+++ b/test/Analysis/diagnostics/explicit-suppression.cpp
@@ -19,6 +19,6 @@ class C {
 void testCopyNull(C *I, C *E) {
   std::copy(I, E, (C *)0);
 #ifndef SUPPRESSED
-  // expected-warning@../Inputs/system-header-simulator-cxx.h:191 {{Called C++ object pointer is null}}
+  // expected-warning@../Inputs/system-header-simulator-cxx.h:490 {{Called C++ object pointer is null}}
 #endif
 }
diff --git a/test/Analysis/iterator-past-end.cpp b/test/Analysis/iterator-past-end.cpp
deleted file mode 100644
index 252d1044bd16..000000000000
--- a/test/Analysis/iterator-past-end.cpp
+++ /dev/null
@@ -1,205 +0,0 @@
-// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorPastEnd -analyzer-eagerly-assume -analyzer-config c++-container-inlining=false %s -verify
-// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorPastEnd -analyzer-eagerly-assume -analyzer-config c++-container-inlining=true -DINLINE=1 %s -verify
-
-#include "Inputs/system-header-simulator-cxx.h"
-
-void simple_good(const std::vector<int> &v) {
-  auto i = v.end();
-  if (i != v.end())
-    *i; // no-warning
-}
-
-void simple_good_negated(const std::vector<int> &v) {
-  auto i = v.end();
-  if (!(i == v.end()))
-    *i; // no-warning
-}
-
-void simple_bad(const std::vector<int> &v) {
-  auto i = v.end();
-  *i; // expected-warning{{Iterator accessed past its end}}
-}
-
-void copy(const std::vector<int> &v) {
-  auto i1 = v.end();
-  auto i2 = i1;
-  *i2; // expected-warning{{Iterator accessed past its end}}
-}
-
-void decrease(const std::vector<int> &v) {
-  auto i = v.end();
-  --i;
-  *i; // no-warning
-}
-
-void copy_and_decrease1(const std::vector<int> &v) {
-  auto i1 = v.end();
-  auto i2 = i1;
-  --i1;
-  *i1; // no-warning
-}
-
-void copy_and_decrease2(const std::vector<int> &v) {
-  auto i1 = v.end();
-  auto i2 = i1;
-  --i1;
-  *i2; // expected-warning{{Iterator accessed past its end}}
-}
-
-void copy_and_increase1(const std::vector<int> &v) {
-  auto i1 = v.begin();
-  auto i2 = i1;
-  ++i1;
-  if (i1 == v.end())
-    *i2; // no-warning
-}
-
-void copy_and_increase2(const std::vector<int> &v) {
-  auto i1 = v.begin();
-  auto i2 = i1;
-  ++i1;
-  if (i2 == v.end())
-    *i2; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find(std::vector<int> &vec, int e) {
-  auto first = std::find(vec.begin(), vec.end(), e);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find(std::vector<int> &vec, int e) {
-  auto first = std::find(vec.begin(), vec.end(), e);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find_end(std::vector<int> &vec, std::vector<int> &seq) {
-  auto last = std::find_end(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != last)
-    *last; // no-warning
-}
-
-void bad_find_end(std::vector<int> &vec, std::vector<int> &seq) {
-  auto last = std::find_end(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *last; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find_first_of(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first =
-      std::find_first_of(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find_first_of(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first = std::find_end(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-bool odd(int i) { return i % 2; }
-
-void good_find_if(std::vector<int> &vec) {
-  auto first = std::find_if(vec.begin(), vec.end(), odd);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find_if(std::vector<int> &vec, int e) {
-  auto first = std::find_if(vec.begin(), vec.end(), odd);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_find_if_not(std::vector<int> &vec) {
-  auto first = std::find_if_not(vec.begin(), vec.end(), odd);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_find_if_not(std::vector<int> &vec, int e) {
-  auto first = std::find_if_not(vec.begin(), vec.end(), odd);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_lower_bound(std::vector<int> &vec, int e) {
-  auto first = std::lower_bound(vec.begin(), vec.end(), e);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_lower_bound(std::vector<int> &vec, int e) {
-  auto first = std::lower_bound(vec.begin(), vec.end(), e);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_upper_bound(std::vector<int> &vec, int e) {
-  auto last = std::lower_bound(vec.begin(), vec.end(), e);
-  if (vec.end() != last)
-    *last; // no-warning
-}
-
-void bad_upper_bound(std::vector<int> &vec, int e) {
-  auto last = std::lower_bound(vec.begin(), vec.end(), e);
-  *last; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_search(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first = std::search(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_search(std::vector<int> &vec, std::vector<int> &seq) {
-  auto first = std::search(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void good_search_n(std::vector<int> &vec, std::vector<int> &seq) {
-  auto nth = std::search_n(vec.begin(), vec.end(), seq.begin(), seq.end());
-  if (vec.end() != nth)
-    *nth; // no-warning
-}
-
-void bad_search_n(std::vector<int> &vec, std::vector<int> &seq) {
-  auto nth = std::search_n(vec.begin(), vec.end(), seq.begin(), seq.end());
-  *nth; // expected-warning{{Iterator accessed past its end}}
-}
-
-template <class InputIterator, class T>
-InputIterator nonStdFind(InputIterator first, InputIterator last,
-                         const T &val) {
-  for (auto i = first; i != last; ++i) {
-    if (*i == val) {
-      return i;
-    }
-  }
-  return last;
-}
-
-void good_non_std_find(std::vector<int> &vec, int e) {
-  auto first = nonStdFind(vec.begin(), vec.end(), e);
-  if (vec.end() != first)
-    *first; // no-warning
-}
-
-void bad_non_std_find(std::vector<int> &vec, int e) {
-  auto first = nonStdFind(vec.begin(), vec.end(), e);
-  *first; // expected-warning{{Iterator accessed past its end}}
-}
-
-void tricky(std::vector<int> &vec, int e) {
-  const auto first = vec.begin();
-  const auto comp1 = (first != vec.end()), comp2 = (first == vec.end());
-  if (comp1)
-    *first;
-}
-
-void loop(std::vector<int> &vec, int e) {
-  auto start = vec.begin();
-  while (true) {
-    auto item = std::find(start, vec.end(), e);
-    if (item == vec.end())
-      break;
-    *item;          // no-warning
-    start = ++item; // no-warning
-  }
-}
diff --git a/test/Analysis/iterator-range.cpp b/test/Analysis/iterator-range.cpp
new file mode 100644
index 000000000000..79b45188ab5d
--- /dev/null
+++ b/test/Analysis/iterator-range.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorRange -analyzer-eagerly-assume -analyzer-config c++-container-inlining=false %s -verify
+// RUN: %clang_analyze_cc1 -std=c++11 -analyzer-checker=core,cplusplus,alpha.cplusplus.IteratorRange -analyzer-eagerly-assume -analyzer-config c++-container-inlining=true -DINLINE=1 %s -verify
+
+#include "Inputs/system-header-simulator-cxx.h"
+
+void clang_analyzer_warnIfReached();
+
+void simple_good_end(const std::vector<int> &v) {
+  auto i = v.end();
+  if (i != v.end()) {
+    clang_analyzer_warnIfReached();
+    *i; // no-warning
+  }
+}
+
+void simple_bad_end(const std::vector<int> &v) {
+  auto i = v.end();
+  *i; // expected-warning{{Iterator accessed outside of its range}}
+}
diff --git a/test/Analysis/pthreadlock.c b/test/Analysis/pthreadlock.c
index 98868172523d..56a92d7d4d92 100644
--- a/test/Analysis/pthreadlock.c
+++ b/test/Analysis/pthreadlock.c
@@ -176,6 +176,42 @@ ok22(void) {
   pthread_mutex_unlock(pmtx);  // no-warning
 }
 
+void ok23(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_destroy(&mtx1);        // no-warning
+}
+
+void ok24(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_lock(&mtx1);           // no-warning
+}
+
+void ok25(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_unlock(&mtx1);         // no-warning
+}
+
+void ok26(void) {
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_lock(&mtx1);           // no-warning
+}
+
+void ok27(void) {
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_lock(&mtx1);           // no-warning
+  else
+    pthread_mutex_init(&mtx1, NULL); // no-warning
+}
+
+void ok28(void) {
+  if (pthread_mutex_destroy(&mtx1) != 0) { // no-warning
+    pthread_mutex_lock(&mtx1);             // no-warning
+    pthread_mutex_unlock(&mtx1);           // no-warning
+    pthread_mutex_destroy(&mtx1);          // no-warning
+  }
+}
 
 void
 bad1(void)
@@ -392,3 +428,46 @@ bad26(void)
 	pthread_mutex_unlock(&mtx1);		// no-warning
 	pthread_mutex_init(&mtx1, NULL);	// expected-warning{{This lock has already been initialized}}
 }
+
+void bad27(void) {
+  pthread_mutex_unlock(&mtx1);            // no-warning
+  int ret = pthread_mutex_destroy(&mtx1); // no-warning
+  if (ret != 0)                           // no-warning
+    pthread_mutex_lock(&mtx1);            // no-warning
+  else
+    pthread_mutex_unlock(&mtx1); // expected-warning{{This lock has already been destroyed}}
+}
+
+void bad28(void) {
+  pthread_mutex_unlock(&mtx1);            // no-warning
+  int ret = pthread_mutex_destroy(&mtx1); // no-warning
+  if (ret != 0)                           // no-warning
+    pthread_mutex_lock(&mtx1);            // no-warning
+  else
+    pthread_mutex_lock(&mtx1); // expected-warning{{This lock has already been destroyed}}
+}
+
+void bad29(void) {
+  pthread_mutex_lock(&mtx1);             // no-warning
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_init(&mtx1, NULL);     // expected-warning{{This lock has already been initialized}}
+  else
+    pthread_mutex_init(&mtx1, NULL); // no-warning
+}
+
+void bad30(void) {
+  pthread_mutex_lock(&mtx1);             // no-warning
+  pthread_mutex_unlock(&mtx1);           // no-warning
+  if (pthread_mutex_destroy(&mtx1) != 0) // no-warning
+    pthread_mutex_init(&mtx1, NULL);     // expected-warning{{This lock has already been initialized}}
+  else
+    pthread_mutex_destroy(&mtx1); // expected-warning{{This lock has already been destroyed}}
+}
+
+void bad31(void) {
+  int ret = pthread_mutex_destroy(&mtx1); // no-warning
+  pthread_mutex_lock(&mtx1);              // expected-warning{{This lock has already been destroyed}}
+  if (ret != 0)
+    pthread_mutex_lock(&mtx1);
+}
diff --git a/test/Analysis/taint-generic.c b/test/Analysis/taint-generic.c
index 8efed66dacbf..d3ca246b82a8 100644
--- a/test/Analysis/taint-generic.c
+++ b/test/Analysis/taint-generic.c
@@ -192,20 +192,41 @@ void testStruct() {
 
 void testStructArray() {
   struct {
-    char buf[16];
-    struct {
-      int length;
-    } st[1];
-  } tainted;
+    int length;
+  } tainted[4];
 
-  char buffer[16];
+  char dstbuf[16], srcbuf[16];
   int sock;
 
   sock = socket(AF_INET, SOCK_STREAM, 0);
-  read(sock, &tainted.buf[0], sizeof(tainted.buf));
-  read(sock, &tainted.st[0], sizeof(tainted.st));
-  // FIXME: tainted.st[0].length should be marked tainted
-  __builtin_memcpy(buffer, tainted.buf, tainted.st[0].length); // no-warning
+  __builtin_memset(srcbuf, 0, sizeof(srcbuf));
+
+  read(sock, &tainted[0], sizeof(tainted));
+  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+
+  __builtin_memset(&tainted, 0, sizeof(tainted));
+  read(sock, &tainted, sizeof(tainted));
+  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // expected-warning {{Untrusted data is used to specify the buffer size}}
+
+  __builtin_memset(&tainted, 0, sizeof(tainted));
+  // If we taint element 1, we should not raise an alert on taint for element 0 or element 2
+  read(sock, &tainted[1], sizeof(tainted));
+  __builtin_memcpy(dstbuf, srcbuf, tainted[0].length); // no-warning
+  __builtin_memcpy(dstbuf, srcbuf, tainted[2].length); // no-warning
+}
+
+void testUnion() {
+  union {
+    int x;
+    char y[4];
+  } tainted;
+
+  char buffer[4];
+
+  int sock = socket(AF_INET, SOCK_STREAM, 0);
+  read(sock, &tainted.y, sizeof(tainted.y));
+  // FIXME: overlapping regions aren't detected by isTainted yet
+  __builtin_memcpy(buffer, tainted.y, tainted.x);
 }
 
 int testDivByZero() {
diff --git a/test/CodeGen/altivec-ct.c b/test/CodeGen/altivec-ct.c
new file mode 100644
index 000000000000..1a3e14dc1fd5
--- /dev/null
+++ b/test/CodeGen/altivec-ct.c
@@ -0,0 +1,82 @@
+// RUN: %clang_cc1 -triple powerpc64le-linux-gnu -S -O0 -o - %s -target-feature +altivec -target-feature +vsx | FileCheck %s -check-prefix=CHECK -check-prefix=VSX
+// RUN: %clang_cc1 -triple powerpc-linux-gnu -S -O0 -o - %s -target-feature +altivec -target-feature -vsx | FileCheck %s
+
+// REQUIRES: powerpc-registered-target
+
+#include <altivec.h>
+
+// CHECK-LABEL: test1
+// CHECK: vcfsx
+vector float test1(vector int x) {
+  return vec_ctf(x, 0);
+}
+
+// CHECK-LABEL: test2
+// CHECK: vcfux
+vector float test2(vector unsigned int x) {
+  return vec_ctf(x, 0);
+}
+
+#ifdef __VSX__
+// VSX-LABEL: test3
+vector double test3(vector signed long long x) {
+  return vec_ctf(x, 0);
+}
+
+// VSX-LABEL: test4
+vector double test4(vector unsigned long long x) {
+  return vec_ctf(x, 0);
+}
+#endif
+
+// CHECK-LABEL: test5
+// CHECK: vcfsx
+vector float test5(vector int x) {
+  return vec_vcfsx(x, 0);
+}
+
+// CHECK-LABEL: test6
+// CHECK: vcfux
+vector float test6(vector unsigned int x) {
+  return vec_vcfux(x, 0);
+}
+
+// CHECK-LABEL: test7
+// CHECK: vctsxs
+vector int test7(vector float x) {
+  return vec_cts(x, 0);
+}
+
+#ifdef __VSX__
+// VSX-LABEL: test8
+vector signed long long test8(vector double x) {
+  return vec_cts(x, 0);
+}
+
+#endif
+
+// CHECK-LABEL: test9
+// CHECK: vctsxs
+vector int test9(vector float x) {
+  return vec_vctsxs(x, 0);
+}
+
+// CHECK-LABEL: test10
+// CHECK: vctuxs
+vector unsigned test10(vector float x) {
+  return vec_ctu(x, 0);
+}
+
+#ifdef __VSX__
+// VSX-LABEL: test11
+vector unsigned long long test11(vector double x) {
+  return vec_ctu(x, 0);
+}
+
+#endif
+
+// CHECK-LABEL: test12
+// CHECK: vctuxs
+vector unsigned test12(vector float x) {
+  return vec_vctuxs(x, 0);
+}
diff --git a/test/CodeGen/arm_neon_intrinsics.c b/test/CodeGen/arm_neon_intrinsics.c
index a8b03b5d9b0b..ae7c78e08f86 100644
--- a/test/CodeGen/arm_neon_intrinsics.c
+++ b/test/CodeGen/arm_neon_intrinsics.c
@@ -1,5 +1,6 @@
 // RUN: %clang_cc1 -triple thumbv7s-apple-darwin -target-abi apcs-gnu\
-// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding -emit-llvm -o - %s \
+// RUN:  -target-cpu swift -fallow-half-arguments-and-returns -ffreestanding \
+// RUN:  -disable-O0-optnone -emit-llvm -o - %s \
 // RUN:  | opt -S -mem2reg | FileCheck %s
 
 // REQUIRES: long-tests
@@ -3480,11 +3481,11 @@ float32_t test_vgetq_lane_f32(float32x4_t a) {
 }
 
 // CHECK-LABEL: @test_vgetq_lane_f16(
-// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 16
+// CHECK:   [[__REINT_244:%.*]] = alloca <8 x half>, align 8
 // CHECK:   [[__REINT1_244:%.*]] = alloca i16, align 2
-// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 16
+// CHECK:   store <8 x half> %a, <8 x half>* [[__REINT_244]], align 8
 // CHECK:   [[TMP0:%.*]] = bitcast <8 x half>* [[__REINT_244]] to <8 x i16>*
-// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 16
+// CHECK:   [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 8
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> [[TMP1]] to <16 x i8>
 // CHECK:   [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <8 x i16>
 // CHECK:   [[VGET_LANE:%.*]] = extractelement <8 x i16> [[TMP3]], i32 3
@@ -4542,7 +4543,7 @@ poly16x4_t test_vld1_lane_p16(poly16_t const * a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_u8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
@@ -4550,7 +4551,7 @@ uint8x16x2_t test_vld2q_u8(uint8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_u16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4559,7 +4560,7 @@ uint16x8x2_t test_vld2q_u16(uint16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_u32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
@@ -4568,7 +4569,7 @@ uint32x4x2_t test_vld2q_u32(uint32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_s8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 int8x16x2_t test_vld2q_s8(int8_t const * a) {
@@ -4576,7 +4577,7 @@ int8x16x2_t test_vld2q_s8(int8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_s16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4585,7 +4586,7 @@ int16x8x2_t test_vld2q_s16(int16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_s32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <4 x i32>, <4 x i32>
@@ -4594,7 +4595,7 @@ int32x4x2_t test_vld2q_s32(int32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_f16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4603,7 +4604,7 @@ float16x8x2_t test_vld2q_f16(float16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_f32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <4 x float>, <4 x float>
@@ -4612,7 +4613,7 @@ float32x4x2_t test_vld2q_f32(float32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_p8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET]] to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <16 x i8>, <16 x i8>
 poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
@@ -4620,7 +4621,7 @@ poly8x16x2_t test_vld2q_p8(poly8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_p16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD2Q_V:%.*]] = call { <8 x i16>, <8 x i16>
@@ -4839,24 +4840,24 @@ poly16x4x2_t test_vld2_dup_p16(poly16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -4866,24 +4867,24 @@ uint16x8x2_t test_vld2q_lane_u16(uint16_t const * a, uint16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -4893,24 +4894,24 @@ uint32x4x2_t test_vld2q_lane_u32(uint32_t const * a, uint32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -4920,24 +4921,24 @@ int16x8x2_t test_vld2q_lane_s16(int16_t const * a, int16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -4947,24 +4948,24 @@ int32x4x2_t test_vld2q_lane_s32(int32_t const * a, int32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -4974,24 +4975,24 @@ float16x8x2_t test_vld2q_lane_f16(float16_t const * a, float16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
@@ -5001,24 +5002,24 @@ float32x4x2_t test_vld2q_lane_f32(float32_t const * a, float32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld2q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5283,7 +5284,7 @@ poly16x4x2_t test_vld2_lane_p16(poly16_t const * a, poly16x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_u8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
@@ -5291,7 +5292,7 @@ uint8x16x3_t test_vld3q_u8(uint8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_u16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5300,7 +5301,7 @@ uint16x8x3_t test_vld3q_u16(uint16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_u32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
@@ -5309,7 +5310,7 @@ uint32x4x3_t test_vld3q_u32(uint32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_s8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 int8x16x3_t test_vld3q_s8(int8_t const * a) {
@@ -5317,7 +5318,7 @@ int8x16x3_t test_vld3q_s8(int8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_s16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5326,7 +5327,7 @@ int16x8x3_t test_vld3q_s16(int16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_s32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>
@@ -5335,7 +5336,7 @@ int32x4x3_t test_vld3q_s32(int32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_f16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5344,7 +5345,7 @@ float16x8x3_t test_vld3q_f16(float16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_f32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>
@@ -5353,7 +5354,7 @@ float32x4x3_t test_vld3q_f32(float32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_p8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x3_t* [[__RET]] to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>
 poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
@@ -5361,7 +5362,7 @@ poly8x16x3_t test_vld3q_p8(poly8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_p16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD3Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>
@@ -5580,28 +5581,28 @@ poly16x4x3_t test_vld3_dup_p16(poly16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5612,28 +5613,28 @@ uint16x8x3_t test_vld3q_lane_u16(uint16_t const * a, uint16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -5644,28 +5645,28 @@ uint32x4x3_t test_vld3q_lane_u32(uint32_t const * a, uint32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5676,28 +5677,28 @@ int16x8x3_t test_vld3q_lane_s16(int16_t const * a, int16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -5708,28 +5709,28 @@ int32x4x3_t test_vld3q_lane_s32(int32_t const * a, int32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -5740,28 +5741,28 @@ float16x8x3_t test_vld3q_lane_f16(float16_t const * a, float16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
@@ -5772,28 +5773,28 @@ float32x4x3_t test_vld3q_lane_f32(float32_t const * a, float32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld3q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x3_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6103,7 +6104,7 @@ poly16x4x3_t test_vld3_lane_p16(poly16_t const * a, poly16x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_u8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint8x16x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
@@ -6111,7 +6112,7 @@ uint8x16x4_t test_vld4q_u8(uint8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_u16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6120,7 +6121,7 @@ uint16x8x4_t test_vld4q_u16(uint16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_u32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
@@ -6129,7 +6130,7 @@ uint32x4x4_t test_vld4q_u32(uint32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_s8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int8x16x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 int8x16x4_t test_vld4q_s8(int8_t const * a) {
@@ -6137,7 +6138,7 @@ int8x16x4_t test_vld4q_s8(int8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_s16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6146,7 +6147,7 @@ int16x8x4_t test_vld4q_s16(int16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_s32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>
@@ -6155,7 +6156,7 @@ int32x4x4_t test_vld4q_s32(int32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_f16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6164,7 +6165,7 @@ float16x8x4_t test_vld4q_f16(float16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_f32(
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <4 x float>, <4 x float>, <4 x float>, <4 x float>
@@ -6173,7 +6174,7 @@ float32x4x4_t test_vld4q_f32(float32_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_p8(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly8x16x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x4_t* [[__RET]] to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>
 poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
@@ -6181,7 +6182,7 @@ poly8x16x4_t test_vld4q_p8(poly8_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_p16(
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VLD4Q_V:%.*]] = call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>
@@ -6400,32 +6401,32 @@ poly16x4x4_t test_vld4_dup_p16(poly16_t const * a) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6437,32 +6438,32 @@ uint16x8x4_t test_vld4q_lane_u16(uint16_t const * a, uint16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -6474,32 +6475,32 @@ uint32x4x4_t test_vld4q_lane_u32(uint32_t const * a, uint32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6511,32 +6512,32 @@ int16x8x4_t test_vld4q_lane_s16(int16_t const * a, int16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x i32> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x i32> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x i32> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <4 x i32> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x i32>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x i32>
@@ -6548,32 +6549,32 @@ int32x4x4_t test_vld4q_lane_s32(int32_t const * a, int32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x half> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x half> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x half> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x half> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -6585,32 +6586,32 @@ float16x8x4_t test_vld4q_lane_f16(float16_t const * a, float16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.float32x4x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <4 x float> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <4 x float> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <4 x float> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <4 x float> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <4 x float>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <4 x float>
@@ -6622,32 +6623,32 @@ float32x4x4_t test_vld4q_lane_f32(float32_t const * a, float32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vld4q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__RET:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly16x8x4_t* [[__RET]] to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP6:%.*]] = bitcast <8 x i16> [[TMP5]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP8:%.*]] = bitcast <8 x i16> [[TMP7]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP9:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP10:%.*]] = bitcast <8 x i16> [[TMP9]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP11:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP12:%.*]] = bitcast <8 x i16> [[TMP11]] to <16 x i8>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP6]] to <8 x i16>
 // CHECK:   [[TMP14:%.*]] = bitcast <16 x i8> [[TMP8]] to <8 x i16>
@@ -14548,21 +14549,21 @@ float32x4_t test_vsetq_lane_f32(float32_t a, float32x4_t b) {
 
 // CHECK-LABEL: @test_vsetq_lane_f16(
 // CHECK:   [[__REINT_248:%.*]] = alloca half, align 2
-// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 16
-// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 16
+// CHECK:   [[__REINT1_248:%.*]] = alloca <8 x half>, align 8
+// CHECK:   [[__REINT2_248:%.*]] = alloca <8 x i16>, align 8
 // CHECK:   [[TMP0:%.*]] = load half, half* %a, align 2
 // CHECK:   store half [[TMP0]], half* [[__REINT_248]], align 2
-// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 16
+// CHECK:   store <8 x half> %b, <8 x half>* [[__REINT1_248]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast half* [[__REINT_248]] to i16*
 // CHECK:   [[TMP2:%.*]] = load i16, i16* [[TMP1]], align 2
 // CHECK:   [[TMP3:%.*]] = bitcast <8 x half>* [[__REINT1_248]] to <8 x i16>*
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[TMP3]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[TMP6:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[VSET_LANE:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[TMP2]], i32 3
-// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 16
+// CHECK:   store <8 x i16> [[VSET_LANE]], <8 x i16>* [[__REINT2_248]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16>* [[__REINT2_248]] to <8 x half>*
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[TMP7]], align 8
 // CHECK:   ret <8 x half> [[TMP8]]
 float16x8_t test_vsetq_lane_f16(float16_t *a, float16x8_t b) {
   return vsetq_lane_f16(*a, b, 3);
@@ -16193,20 +16194,20 @@ void test_vst1_lane_p16(poly16_t * a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_u8(
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x2_t, %struct.uint8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
 // CHECK:   ret void
 void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
@@ -16214,22 +16215,22 @@ void test_vst2q_u8(uint8_t * a, uint8x16x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16240,22 +16241,22 @@ void test_vst2q_u16(uint16_t * a, uint16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16266,20 +16267,20 @@ void test_vst2q_u32(uint32_t * a, uint32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_s8(
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x2_t, %struct.int8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
 // CHECK:   ret void
 void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
@@ -16287,22 +16288,22 @@ void test_vst2q_s8(int8_t * a, int8x16x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16313,22 +16314,22 @@ void test_vst2q_s16(int16_t * a, int16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16339,22 +16340,22 @@ void test_vst2q_s32(int32_t * a, int32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16365,22 +16366,22 @@ void test_vst2q_f16(float16_t * a, float16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -16391,20 +16392,20 @@ void test_vst2q_f32(float32_t * a, float32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_p8(
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <16 x i8>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x2_t, %struct.poly8x16x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <16 x i8>], [2 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   call void @llvm.arm.neon.vst2.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], i32 1)
 // CHECK:   ret void
 void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
@@ -16412,22 +16413,22 @@ void test_vst2q_p8(poly8_t * a, poly8x16x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16735,22 +16736,22 @@ void test_vst2_p16(poly16_t * a, poly16x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x2_t, %struct.uint16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16761,22 +16762,22 @@ void test_vst2q_lane_u16(uint16_t * a, uint16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x2_t, %struct.uint32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16787,22 +16788,22 @@ void test_vst2q_lane_u32(uint32_t * a, uint32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x2_t, %struct.int16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16813,22 +16814,22 @@ void test_vst2q_lane_s16(int16_t * a, int16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x i32>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x2_t, %struct.int32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x i32>], [2 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -16839,22 +16840,22 @@ void test_vst2q_lane_s32(int32_t * a, int32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x half>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x2_t, %struct.float16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x half>], [2 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -16865,22 +16866,22 @@ void test_vst2q_lane_f16(float16_t * a, float16x8x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <4 x float>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x2_t, %struct.float32x4x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <4 x float>], [2 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -16891,22 +16892,22 @@ void test_vst2q_lane_f32(float32_t * a, float32x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst2q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x2_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [2 x <8 x i16>]* [[COERCE_DIVE]] to [4 x i64]*
-// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 16
+// CHECK:   store [4 x i64] [[B]].coerce, [4 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x2_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x2_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 32, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x2_t, %struct.poly16x8x2_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [2 x <8 x i16>], [2 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[TMP8:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP9:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17162,23 +17163,23 @@ void test_vst2_lane_p16(poly16_t * a, poly16x4x2_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_u8(
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x3_t, %struct.uint8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
 // CHECK:   ret void
 void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
@@ -17186,26 +17187,26 @@ void test_vst3q_u8(uint8_t * a, uint8x16x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17217,26 +17218,26 @@ void test_vst3q_u16(uint16_t * a, uint16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17248,23 +17249,23 @@ void test_vst3q_u32(uint32_t * a, uint32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_s8(
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x3_t, %struct.int8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
 // CHECK:   ret void
 void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
@@ -17272,26 +17273,26 @@ void test_vst3q_s8(int8_t * a, int8x16x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17303,26 +17304,26 @@ void test_vst3q_s16(int16_t * a, int16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17334,26 +17335,26 @@ void test_vst3q_s32(int32_t * a, int32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17365,26 +17366,26 @@ void test_vst3q_f16(float16_t * a, float16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -17396,23 +17397,23 @@ void test_vst3q_f32(float32_t * a, float32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_p8(
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <16 x i8>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x3_t, %struct.poly8x16x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <16 x i8>], [3 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   call void @llvm.arm.neon.vst3.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], i32 1)
 // CHECK:   ret void
 void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
@@ -17420,26 +17421,26 @@ void test_vst3q_p8(poly8_t * a, poly8x16x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17802,26 +17803,26 @@ void test_vst3_p16(poly16_t * a, poly16x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x3_t, %struct.uint16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17833,26 +17834,26 @@ void test_vst3q_lane_u16(uint16_t * a, uint16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x3_t, %struct.uint32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17864,26 +17865,26 @@ void test_vst3q_lane_u32(uint32_t * a, uint32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x3_t, %struct.int16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17895,26 +17896,26 @@ void test_vst3q_lane_s16(int16_t * a, int16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x i32>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x3_t, %struct.int32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x i32>], [3 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -17926,26 +17927,26 @@ void test_vst3q_lane_s32(int32_t * a, int32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x half>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x3_t, %struct.float16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x half>], [3 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -17957,26 +17958,26 @@ void test_vst3q_lane_f16(float16_t * a, float16x8x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <4 x float>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x3_t, %struct.float32x4x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <4 x float>], [3 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -17988,26 +17989,26 @@ void test_vst3q_lane_f32(float32_t * a, float32x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst3q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x3_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x3_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [3 x <8 x i16>]* [[COERCE_DIVE]] to [6 x i64]*
-// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 16
+// CHECK:   store [6 x i64] [[B]].coerce, [6 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x3_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x3_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 48, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x3_t, %struct.poly16x8x3_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [3 x <8 x i16>], [3 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[TMP10:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP11:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18308,26 +18309,26 @@ void test_vst3_lane_p16(poly16_t * a, poly16x4x3_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_u8(
-// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint8x16x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint8x16x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint8x16x4_t, %struct.uint8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 8
 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
 // CHECK:   ret void
 void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
@@ -18335,30 +18336,30 @@ void test_vst4q_u8(uint8_t * a, uint8x16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18371,30 +18372,30 @@ void test_vst4q_u16(uint16_t * a, uint16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -18407,26 +18408,26 @@ void test_vst4q_u32(uint32_t * a, uint32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_s8(
-// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int8x16x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int8x16x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int8x16x4_t, %struct.int8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 8
 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
 // CHECK:   ret void
 void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
@@ -18434,30 +18435,30 @@ void test_vst4q_s8(int8_t * a, int8x16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18470,30 +18471,30 @@ void test_vst4q_s16(int16_t * a, int16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -18506,30 +18507,30 @@ void test_vst4q_s32(int32_t * a, int32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -18542,30 +18543,30 @@ void test_vst4q_f16(float16_t * a, float16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -18578,26 +18579,26 @@ void test_vst4q_f32(float32_t * a, float32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_p8(
-// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly8x16x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly8x16x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <16 x i8>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly8x16x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly8x16x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP3:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX]], align 8
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP4:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX2]], align 8
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP5:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX4]], align 8
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly8x16x4_t, %struct.poly8x16x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <16 x i8>], [4 x <16 x i8>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP6:%.*]] = load <16 x i8>, <16 x i8>* [[ARRAYIDX6]], align 8
 // CHECK:   call void @llvm.arm.neon.vst4.p0i8.v16i8(i8* %a, <16 x i8> [[TMP3]], <16 x i8> [[TMP4]], <16 x i8> [[TMP5]], <16 x i8> [[TMP6]], i32 1)
 // CHECK:   ret void
 void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
@@ -18605,30 +18606,30 @@ void test_vst4q_p8(poly8_t * a, poly8x16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19046,30 +19047,30 @@ void test_vst4_p16(poly16_t * a, poly16x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_u16(
-// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint16x8x4_t, %struct.uint16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19082,30 +19083,30 @@ void test_vst4q_lane_u16(uint16_t * a, uint16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_u32(
-// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.uint32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.uint32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.uint32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.uint32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.uint32x4x4_t, %struct.uint32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -19118,30 +19119,30 @@ void test_vst4q_lane_u32(uint32_t * a, uint32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_s16(
-// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int16x8x4_t, %struct.int16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19154,30 +19155,30 @@ void test_vst4q_lane_s16(int16_t * a, int16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_s32(
-// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.int32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.int32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x i32>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.int32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.int32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i32* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x i32> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x i32> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.int32x4x4_t, %struct.int32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x i32>], [4 x <4 x i32>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x i32>, <4 x i32>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x i32> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x i32>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x i32>
@@ -19190,30 +19191,30 @@ void test_vst4q_lane_s32(int32_t * a, int32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_f16(
-// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x half>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast half* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x half> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x half> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x half> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float16x8x4_t, %struct.float16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x half>], [4 x <8 x half>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x half>, <8 x half>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x half> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -19226,30 +19227,30 @@ void test_vst4q_lane_f16(float16_t * a, float16x8x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_f32(
-// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.float32x4x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.float32x4x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <4 x float>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.float32x4x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.float32x4x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast float* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <4 x float> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <4 x float> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <4 x float> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.float32x4x4_t, %struct.float32x4x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <4 x float>], [4 x <4 x float>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <4 x float>, <4 x float>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <4 x float> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <4 x float>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <4 x float>
@@ -19262,30 +19263,30 @@ void test_vst4q_lane_f32(float32_t * a, float32x4x4_t b) {
 }
 
 // CHECK-LABEL: @test_vst4q_lane_p16(
-// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 16
-// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 16
+// CHECK:   [[B:%.*]] = alloca %struct.poly16x8x4_t, align 8
+// CHECK:   [[__S1:%.*]] = alloca %struct.poly16x8x4_t, align 8
 // CHECK:   [[COERCE_DIVE:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[B]], i32 0, i32 0
 // CHECK:   [[TMP0:%.*]] = bitcast [4 x <8 x i16>]* [[COERCE_DIVE]] to [8 x i64]*
-// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 16
+// CHECK:   store [8 x i64] [[B]].coerce, [8 x i64]* [[TMP0]], align 8
 // CHECK:   [[TMP1:%.*]] = bitcast %struct.poly16x8x4_t* [[__S1]] to i8*
 // CHECK:   [[TMP2:%.*]] = bitcast %struct.poly16x8x4_t* [[B]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP1]], i8* [[TMP2]], i32 64, i32 8, i1 false)
 // CHECK:   [[TMP3:%.*]] = bitcast i16* %a to i8*
 // CHECK:   [[VAL:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL]], i32 0, i32 0
-// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 16
+// CHECK:   [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX]], align 8
 // CHECK:   [[TMP5:%.*]] = bitcast <8 x i16> [[TMP4]] to <16 x i8>
 // CHECK:   [[VAL1:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX2:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL1]], i32 0, i32 1
-// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 16
+// CHECK:   [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX2]], align 8
 // CHECK:   [[TMP7:%.*]] = bitcast <8 x i16> [[TMP6]] to <16 x i8>
 // CHECK:   [[VAL3:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX4:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL3]], i32 0, i32 2
-// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 16
+// CHECK:   [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX4]], align 8
 // CHECK:   [[TMP9:%.*]] = bitcast <8 x i16> [[TMP8]] to <16 x i8>
 // CHECK:   [[VAL5:%.*]] = getelementptr inbounds %struct.poly16x8x4_t, %struct.poly16x8x4_t* [[__S1]], i32 0, i32 0
 // CHECK:   [[ARRAYIDX6:%.*]] = getelementptr inbounds [4 x <8 x i16>], [4 x <8 x i16>]* [[VAL5]], i32 0, i32 3
-// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 16
+// CHECK:   [[TMP10:%.*]] = load <8 x i16>, <8 x i16>* [[ARRAYIDX6]], align 8
 // CHECK:   [[TMP11:%.*]] = bitcast <8 x i16> [[TMP10]] to <16 x i8>
 // CHECK:   [[TMP12:%.*]] = bitcast <16 x i8> [[TMP5]] to <8 x i16>
 // CHECK:   [[TMP13:%.*]] = bitcast <16 x i8> [[TMP7]] to <8 x i16>
@@ -20630,7 +20631,7 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vtrnq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -20640,14 +20641,14 @@ poly16x4x2_t test_vtrn_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !30
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
   return vtrnq_s8(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -20659,14 +20660,14 @@ int8x16x2_t test_vtrnq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !33
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
   return vtrnq_s16(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -20678,14 +20679,14 @@ int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !36
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
   return vtrnq_s32(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -20695,14 +20696,14 @@ int32x4x2_t test_vtrnq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !39
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
   return vtrnq_u8(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -20714,14 +20715,14 @@ uint8x16x2_t test_vtrnq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !42
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
   return vtrnq_u16(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -20733,14 +20734,14 @@ uint16x8x2_t test_vtrnq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VTRN1_I]], <4 x i32>* [[TMP4]], !noalias !45
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
   return vtrnq_u32(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
@@ -20752,14 +20753,14 @@ uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VTRN1_I]], <4 x float>* [[TMP4]], !noalias !48
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
   return vtrnq_f32(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VTRN_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 2, i32 18, i32 4, i32 20, i32 6, i32 22, i32 8, i32 24, i32 10, i32 26, i32 12, i32 28, i32 14, i32 30>
@@ -20769,14 +20770,14 @@ float32x4x2_t test_vtrnq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VTRN1_I]], <16 x i8>* [[TMP2]], !noalias !51
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
   return vtrnq_p8(a, b);
 }
 
 // CHECK-LABEL: @test_vtrnq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -20788,7 +20789,7 @@ poly8x16x2_t test_vtrnq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VTRN1_I]], <8 x i16>* [[TMP4]], !noalias !54
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vtrnq_p16(poly16x8_t a, poly16x8_t b) {
   return vtrnq_p16(a, b);
@@ -21124,7 +21125,7 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vuzpq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -21134,14 +21135,14 @@ poly16x4x2_t test_vuzp_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !84
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
   return vuzpq_s8(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21153,14 +21154,14 @@ int8x16x2_t test_vuzpq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !87
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
   return vuzpq_s16(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21172,14 +21173,14 @@ int16x8x2_t test_vuzpq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !90
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
   return vuzpq_s32(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -21189,14 +21190,14 @@ int32x4x2_t test_vuzpq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !93
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
   return vuzpq_u8(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21208,14 +21209,14 @@ uint8x16x2_t test_vuzpq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !96
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
   return vuzpq_u16(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21227,14 +21228,14 @@ uint16x8x2_t test_vuzpq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VUZP1_I]], <4 x i32>* [[TMP4]], !noalias !99
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
   return vuzpq_u32(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
@@ -21246,14 +21247,14 @@ uint32x4x2_t test_vuzpq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VUZP1_I]], <4 x float>* [[TMP4]], !noalias !102
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
   return vuzpq_f32(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VUZP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -21263,14 +21264,14 @@ float32x4x2_t test_vuzpq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VUZP1_I]], <16 x i8>* [[TMP2]], !noalias !105
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
   return vuzpq_p8(a, b);
 }
 
 // CHECK-LABEL: @test_vuzpq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21282,7 +21283,7 @@ poly8x16x2_t test_vuzpq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VUZP1_I]], <8 x i16>* [[TMP4]], !noalias !108
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vuzpq_p16(poly16x8_t a, poly16x8_t b) {
   return vuzpq_p16(a, b);
@@ -21454,7 +21455,7 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 }
 
 // CHECK-LABEL: @test_vzipq_s8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -21464,14 +21465,14 @@ poly16x4x2_t test_vzip_p16(poly16x4_t a, poly16x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !138
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.int8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.int8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
   return vzipq_s8(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_s16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21483,14 +21484,14 @@ int8x16x2_t test_vzipq_s8(int8x16_t a, int8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !141
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
   return vzipq_s16(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_s32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.int32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21502,14 +21503,14 @@ int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !144
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.int32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.int32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
   return vzipq_s32(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_u8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -21519,14 +21520,14 @@ int32x4x2_t test_vzipq_s32(int32x4_t a, int32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !147
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.uint8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.uint8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
   return vzipq_u8(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_u16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21538,14 +21539,14 @@ uint8x16x2_t test_vzipq_u8(uint8x16_t a, uint8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !150
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
   return vzipq_u16(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_u32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.uint32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x i32> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x i32> %b to <16 x i8>
@@ -21557,14 +21558,14 @@ uint16x8x2_t test_vzipq_u16(uint16x8_t a, uint16x8_t b) {
 // CHECK:   store <4 x i32> [[VZIP1_I]], <4 x i32>* [[TMP4]], !noalias !153
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.uint32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.uint32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
   return vzipq_u32(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_f32(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.float32x4x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <4 x float> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <4 x float> %b to <16 x i8>
@@ -21576,14 +21577,14 @@ uint32x4x2_t test_vzipq_u32(uint32x4_t a, uint32x4_t b) {
 // CHECK:   store <4 x float> [[VZIP1_I]], <4 x float>* [[TMP4]], !noalias !156
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.float32x4x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.float32x4x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
   return vzipq_f32(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_p8(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly8x16x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast i8* [[TMP0]] to <16 x i8>*
 // CHECK:   [[VZIP_I:%.*]] = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -21593,14 +21594,14 @@ float32x4x2_t test_vzipq_f32(float32x4_t a, float32x4_t b) {
 // CHECK:   store <16 x i8> [[VZIP1_I]], <16 x i8>* [[TMP2]], !noalias !159
 // CHECK:   [[TMP3:%.*]] = bitcast %struct.poly8x16x2_t* %agg.result to i8*
 // CHECK:   [[TMP4:%.*]] = bitcast %struct.poly8x16x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP3]], i8* [[TMP4]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
   return vzipq_p8(a, b);
 }
 
 // CHECK-LABEL: @test_vzipq_p16(
-// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 16
+// CHECK:   [[__RET_I:%.*]] = alloca %struct.poly16x8x2_t, align 8
 // CHECK:   [[TMP0:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
 // CHECK:   [[TMP1:%.*]] = bitcast <8 x i16> %a to <16 x i8>
 // CHECK:   [[TMP2:%.*]] = bitcast <8 x i16> %b to <16 x i8>
@@ -21612,7 +21613,7 @@ poly8x16x2_t test_vzipq_p8(poly8x16_t a, poly8x16_t b) {
 // CHECK:   store <8 x i16> [[VZIP1_I]], <8 x i16>* [[TMP4]], !noalias !162
 // CHECK:   [[TMP5:%.*]] = bitcast %struct.poly16x8x2_t* %agg.result to i8*
 // CHECK:   [[TMP6:%.*]] = bitcast %struct.poly16x8x2_t* [[__RET_I]] to i8*
-// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 16, i1 false)
+// CHECK:   call void @llvm.memcpy.p0i8.p0i8.i32(i8* [[TMP5]], i8* [[TMP6]], i32 32, i32 8, i1 false)
 // CHECK:   ret void
 poly16x8x2_t test_vzipq_p16(poly16x8_t a, poly16x8_t b) {
   return vzipq_p16(a, b);
diff --git a/test/CodeGen/union-align.c b/test/CodeGen/union-align.c
index 89a9456e609d..2055d93d8efc 100644
--- a/test/CodeGen/union-align.c
+++ b/test/CodeGen/union-align.c
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | grep load | grep "4 x float" | not grep "align 4"
-// RUN: %clang_cc1 -emit-llvm %s -o - | grep load | grep "4 x float" | grep "align 16"
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | grep load | grep "4 x float" | not grep "align 4"
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | grep load | grep "4 x float" | grep "align 16"
 // PR3432
 // rdar://6536377
 
diff --git a/test/CodeGenOpenCL/bool_cast.cl b/test/CodeGenOpenCL/bool_cast.cl
index 8c86b06577d6..ab40eccf571f 100644
--- a/test/CodeGenOpenCL/bool_cast.cl
+++ b/test/CodeGenOpenCL/bool_cast.cl
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -emit-llvm -o - -O0 | FileCheck %s
+// RUN: %clang_cc1 %s -triple x86_64-unknown-linux-gnu -emit-llvm -o - -O0 | FileCheck %s
 
 typedef unsigned char uchar4 __attribute((ext_vector_type(4)));
 typedef unsigned int int4 __attribute((ext_vector_type(4)));
diff --git a/test/Lexer/preamble.c b/test/Lexer/preamble.c
index 5b2739abefcc..762271f2e3f1 100644
--- a/test/Lexer/preamble.c
+++ b/test/Lexer/preamble.c
@@ -9,15 +9,12 @@
 #pragma unknown
 #endif
 #ifdef WIBBLE
-#include "honk"
-#else
-int foo();
+#include "foo"
+int bar;
 #endif
 
 // This test checks for detection of the preamble of a file, which
-// includes all of the starting comments and #includes. Note that any
-// changes to the preamble part of this file must be mirrored in
-// Inputs/preamble.txt, since we diff against it.
+// includes all of the starting comments and #includes.
 
 // RUN: %clang_cc1 -print-preamble %s > %t
 // RUN: echo END. >> %t
@@ -33,4 +30,6 @@ int foo();
 // CHECK-NEXT: #endif
 // CHECK-NEXT: #pragma unknown
 // CHECK-NEXT: #endif
+// CHECK-NEXT: #ifdef WIBBLE
+// CHECK-NEXT: #include "foo"
 // CHECK-NEXT: END.
diff --git a/test/Lexer/preamble2.c b/test/Lexer/preamble2.c
new file mode 100644
index 000000000000..499a9a22a5b3
--- /dev/null
+++ b/test/Lexer/preamble2.c
@@ -0,0 +1,19 @@
+// Preamble detection test: header with an include guard.
+#ifndef HEADER_H
+#define HEADER_H
+#include "foo"
+int bar;
+#endif
+
+// This test checks for detection of the preamble of a file, which
+// includes all of the starting comments and #includes.
+
+// RUN: %clang_cc1 -print-preamble %s > %t
+// RUN: echo END. >> %t
+// RUN: FileCheck < %t %s
+
+// CHECK: // Preamble detection test: header with an include guard.
+// CHECK-NEXT: #ifndef HEADER_H
+// CHECK-NEXT: #define HEADER_H
+// CHECK-NEXT: #include "foo"
+// CHECK-NEXT: END.
diff --git a/test/Modules/Inputs/preprocess/a.h b/test/Modules/Inputs/preprocess/a.h
new file mode 100644
index 000000000000..1292e99b4b25
--- /dev/null
+++ b/test/Modules/Inputs/preprocess/a.h
@@ -0,0 +1,2 @@
+#include "c.h"
+T a();
diff --git a/test/Modules/Inputs/preprocess/b.h b/test/Modules/Inputs/preprocess/b.h
new file mode 100644
index 000000000000..e8a9f55968f5
--- /dev/null
+++ b/test/Modules/Inputs/preprocess/b.h
@@ -0,0 +1,2 @@
+#include "c.h"
+T b();
diff --git a/test/Modules/Inputs/preprocess/c.h b/test/Modules/Inputs/preprocess/c.h
new file mode 100644
index 000000000000..718f5dc18295
--- /dev/null
+++ b/test/Modules/Inputs/preprocess/c.h
@@ -0,0 +1,4 @@
+#ifndef C_H
+#define C_H
+using T = int;
+#endif
diff --git a/test/Modules/Inputs/preprocess/module.modulemap b/test/Modules/Inputs/preprocess/module.modulemap
index 943435a953d0..f700db03beac 100644
--- a/test/Modules/Inputs/preprocess/module.modulemap
+++ b/test/Modules/Inputs/preprocess/module.modulemap
@@ -1,2 +1,7 @@
 module fwd { header "fwd.h" export * }
 module file { header "file.h" header "file2.h" export * }
+module nested {
+  module a { header "a.h" }
+  module b { header "b.h" }
+  module c { header "c.h" }
+}
diff --git a/test/Modules/preprocess-nested.cpp b/test/Modules/preprocess-nested.cpp
new file mode 100644
index 000000000000..8fccf137e94f
--- /dev/null
+++ b/test/Modules/preprocess-nested.cpp
@@ -0,0 +1,59 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -I%S/Inputs/preprocess -x c++-module-map %S/Inputs/preprocess/module.modulemap -E -o %t/no-rewrite.ii
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -I%S/Inputs/preprocess -x c++-module-map %S/Inputs/preprocess/module.modulemap -E -frewrite-includes -o %t/rewrite.ii
+
+// RUN: FileCheck %s --input-file %t/no-rewrite.ii --check-prefix=CHECK --check-prefix=NO-REWRITE
+// RUN: FileCheck %s --input-file %t/rewrite.ii    --check-prefix=CHECK --check-prefix=REWRITE
+
+// Check that we can build a module from the preprocessed output.
+// FIXME: For now, the files need to exist.
+// RUN: touch %t/a.h %t/b.h %t/c.h
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -x c++-module-map-cpp-output %t/no-rewrite.ii -emit-module -o %t/no-rewrite.pcm
+// RUN: %clang_cc1 -fmodules -fmodules-local-submodule-visibility -fmodule-name=nested -x c++-module-map-cpp-output %t/rewrite.ii -emit-module -o %t/rewrite.pcm
+
+// Check the module we built works.
+// RUN: %clang_cc1 -fmodules -fmodule-file=%t/no-rewrite.pcm %s -I%t -verify -fno-modules-error-recovery
+// RUN: %clang_cc1 -fmodules -fmodule-file=%t/rewrite.pcm %s -I%t -verify -fno-modules-error-recovery -DREWRITE
+
+// == module map
+// CHECK: # 1 "{{.*}}module.modulemap"
+// CHECK: module nested {
+// CHECK:   module a {
+// CHECK:     header "a.h"
+// CHECK:   }
+// CHECK:   module b {
+// CHECK:     header "b.h"
+// CHECK:   }
+// CHECK:   module c {
+// CHECK:     header "c.h"
+// CHECK:   }
+// CHECK: }
+
+// CHECK: #pragma clang module begin nested.a
+// CHECK: #pragma clang module begin nested.c
+// CHECK: using T = int;
+// CHECK: #pragma clang module end
+// CHECK: T a();
+// CHECK: #pragma clang module end
+
+// CHECK: #pragma clang module begin nested.b
+// CHECK: #pragma clang module import nested.c
+// CHECK-NOT: #pragma clang module begin nested.c
+// CHECK-NOT: using T = int;
+// CHECK-NOT: #pragma clang module end
+// CHECK: T b();
+// CHECK: #pragma clang module end
+
+// CHECK: #pragma clang module import nested.c
+
+#pragma clang module import nested.b
+
+int n = b();
+T c; // expected-error {{must be imported}}
+#ifdef REWRITE
+// expected-note@rewrite.ii:* {{declar}}
+#else
+// expected-note@no-rewrite.ii:* {{declar}}
+#endif
diff --git a/test/Modules/preprocess-unavailable.cpp b/test/Modules/preprocess-unavailable.cpp
new file mode 100644
index 000000000000..e568cd7b5251
--- /dev/null
+++ b/test/Modules/preprocess-unavailable.cpp
@@ -0,0 +1,12 @@
+// RUN: %clang_cc1 -x c++-module-map %s -fmodule-name=a -verify
+module a {
+  module b {
+    requires cplusplus11
+  }
+}
+#pragma clang module contents
+// expected-error@3 {{module 'a.b' requires feature 'cplusplus11'}}
+#pragma clang module begin a.b // expected-note {{entering module 'a' due to this pragma}}
+int f();
+int g() { f(); }
+#pragma clang module end // expected-error {{no matching '#pragma clang module begin'}}
diff --git a/test/OpenMP/target_data_messages.c b/test/OpenMP/target_data_messages.c
index 153b4377290c..fd41e7df2a4c 100644
--- a/test/OpenMP/target_data_messages.c
+++ b/test/OpenMP/target_data_messages.c
@@ -4,7 +4,7 @@ void foo() { }
 
 int main(int argc, char **argv) {
   int a;
-  #pragma omp target data // expected-error {{expected at least one map clause for '#pragma omp target data'}}
+  #pragma omp target data // expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
   {}
   L1:
     foo();
diff --git a/test/OpenMP/target_enter_data_map_messages.c b/test/OpenMP/target_enter_data_map_messages.c
index 6f5aad1ef650..8a7de3b96eff 100644
--- a/test/OpenMP/target_enter_data_map_messages.c
+++ b/test/OpenMP/target_enter_data_map_messages.c
@@ -4,7 +4,7 @@
 int main(int argc, char **argv) {
 
   int r;
-  #pragma omp target enter data // expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+  #pragma omp target enter data // expected-error {{expected at least one 'map' clause for '#pragma omp target enter data'}}
 
   #pragma omp target enter data map(r) // expected-error {{map type must be specified for '#pragma omp target enter data'}}
   #pragma omp target enter data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target enter data'}}
diff --git a/test/OpenMP/target_enter_data_nowait_messages.cpp b/test/OpenMP/target_enter_data_nowait_messages.cpp
index e682e8c47d70..508b1b254906 100644
--- a/test/OpenMP/target_enter_data_nowait_messages.cpp
+++ b/test/OpenMP/target_enter_data_nowait_messages.cpp
@@ -6,7 +6,7 @@ int main(int argc, char **argv) {
   #pragma omp nowait target enter data map(to: i) // expected-error {{expected an OpenMP directive}}
   #pragma omp target nowait enter data map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
   #pragma omp target enter nowait data map(to: i) // expected-error {{expected an OpenMP directive}}
-  #pragma omp target enter data nowait() map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target enter data'}}
+  #pragma omp target enter data nowait() map(to: i) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}} expected-error {{expected at least one 'map' clause for '#pragma omp target enter data'}}
   #pragma omp target enter data map(to: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
   #pragma omp target enter data map(to: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target enter data' are ignored}}
   #pragma omp target enter data map(to: i) nowait device (-10u)
diff --git a/test/OpenMP/target_exit_data_map_messages.c b/test/OpenMP/target_exit_data_map_messages.c
index a9953fbbb4c6..84bd7e79fd03 100644
--- a/test/OpenMP/target_exit_data_map_messages.c
+++ b/test/OpenMP/target_exit_data_map_messages.c
@@ -4,7 +4,7 @@
 int main(int argc, char **argv) {
 
   int r;
-  #pragma omp target exit data // expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+  #pragma omp target exit data // expected-error {{expected at least one 'map' clause for '#pragma omp target exit data'}}
 
   #pragma omp target exit data map(r) // expected-error {{map type must be specified for '#pragma omp target exit data'}}
   #pragma omp target exit data map(tofrom: r) // expected-error {{map type 'tofrom' is not allowed for '#pragma omp target exit data'}}
diff --git a/test/OpenMP/target_exit_data_nowait_messages.cpp b/test/OpenMP/target_exit_data_nowait_messages.cpp
index cd743d89278e..d0286031f362 100644
--- a/test/OpenMP/target_exit_data_nowait_messages.cpp
+++ b/test/OpenMP/target_exit_data_nowait_messages.cpp
@@ -6,7 +6,7 @@ int main(int argc, char **argv) {
   #pragma omp nowait target exit data map(from: i) // expected-error {{expected an OpenMP directive}}
   #pragma omp target nowait exit data map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target' are ignored}}
   #pragma omp target exit nowait data map(from: i) // expected-error {{expected an OpenMP directive}}
-  #pragma omp target exit data nowait() map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}} expected-error {{expected at least one map clause for '#pragma omp target exit data'}}
+  #pragma omp target exit data nowait() map(from: i) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}} expected-error {{expected at least one 'map' clause for '#pragma omp target exit data'}}
   #pragma omp target exit data map(from: i) nowait( // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
   #pragma omp target exit data map(from: i) nowait (argc)) // expected-warning {{extra tokens at the end of '#pragma omp target exit data' are ignored}}
   #pragma omp target exit data map(from: i) nowait device (-10u)
diff --git a/test/OpenMP/target_map_messages.cpp b/test/OpenMP/target_map_messages.cpp
index 9d166ae9a1d9..b9640b965d50 100644
--- a/test/OpenMP/target_map_messages.cpp
+++ b/test/OpenMP/target_map_messages.cpp
@@ -467,7 +467,7 @@ int main(int argc, char **argv) {
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one map clause for '#pragma omp target data'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
 #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
 #pragma omp target data map() // expected-error {{expected expression}}
 #pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
diff --git a/test/OpenMP/target_teams_map_messages.cpp b/test/OpenMP/target_teams_map_messages.cpp
index 15839b848532..89425d622c10 100644
--- a/test/OpenMP/target_teams_map_messages.cpp
+++ b/test/OpenMP/target_teams_map_messages.cpp
@@ -464,7 +464,7 @@ int main(int argc, char **argv) {
   int y;
   int to, tofrom, always;
   const int (&l)[5] = da;
-#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one map clause for '#pragma omp target data'}}
+#pragma omp target data map // expected-error {{expected '(' after 'map'}} expected-error {{expected at least one 'map' or 'use_device_ptr' clause for '#pragma omp target data'}}
 #pragma omp target data map( // expected-error {{expected ')'}} expected-note {{to match this '('}} expected-error {{expected expression}}
 #pragma omp target data map() // expected-error {{expected expression}}
 #pragma omp target data map(alloc) // expected-error {{use of undeclared identifier 'alloc'}}
diff --git a/test/SemaOpenCL/arithmetic-conversions.cl b/test/SemaOpenCL/arithmetic-conversions.cl
new file mode 100644
index 000000000000..23103caf85ef
--- /dev/null
+++ b/test/SemaOpenCL/arithmetic-conversions.cl
@@ -0,0 +1,23 @@
+// RUN: %clang_cc1 %s -triple spir-unknown-unknown -verify -pedantic -fsyntax-only -cl-std=CL1.2
+
+typedef float float2 __attribute__((ext_vector_type(2)));
+typedef long long2 __attribute__((ext_vector_type(2)));
+typedef int int2 __attribute__((ext_vector_type(2)));
+
+kernel void foo1(float2 in, global float2 *out) { *out = in + 0.5;} // expected-error {{scalar operand type has greater rank than the type of the vector element. ('float2' (vector of 2 'float' values) and 'double')}}
+
+kernel void foo2(float2 in, global float2 *out) { *out = 0.5 + in;} // expected-error {{scalar operand type has greater rank than the type of the vector element. ('double' and 'float2' (vector of 2 'float' values))}}
+
+kernel void foo3(float2 in, global float2 *out) { *out = 0.5f + in;}
+
+kernel void foo4(long2 in, global long2 *out) { *out = 5 + in;}
+
+kernel void foo5(float2 in, global float2 *out) {
+    float* f;
+    *out = f + in; // expected-error{{cannot convert between vector and non-scalar values ('float *' and 'float2' (vector of 2 'float' values))}}
+}
+
+kernel void foo6(int2 in, global int2 *out) {
+    int* f;
+    *out = f + in; // expected-error{{cannot convert between vector and non-scalar values ('int *' and 'int2' (vector of 2 'int' values))}}
+}
diff --git a/test/SemaOpenCL/clang-builtin-version.cl b/test/SemaOpenCL/clang-builtin-version.cl
index 8574682ab93b..3e270e64c69d 100644
--- a/test/SemaOpenCL/clang-builtin-version.cl
+++ b/test/SemaOpenCL/clang-builtin-version.cl
@@ -4,13 +4,13 @@
 
 kernel void dse_builtins() {
   int tmp;
-  enqueue_kernel(tmp, tmp, tmp, ^(void) { // expected-warning{{implicit declaration of function 'enqueue_kernel' is invalid in C99}}
+  enqueue_kernel(tmp, tmp, tmp, ^(void) { // expected-error{{implicit declaration of function 'enqueue_kernel' is invalid in OpenCL}}
     return;
   });
-  unsigned size = get_kernel_work_group_size(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_work_group_size' is invalid in C99}}
+  unsigned size = get_kernel_work_group_size(^(void) { // expected-error{{implicit declaration of function 'get_kernel_work_group_size' is invalid in OpenCL}}
     return;
   });
-  size = get_kernel_preferred_work_group_size_multiple(^(void) { // expected-warning{{implicit declaration of function 'get_kernel_preferred_work_group_size_multiple' is invalid in C99}}
+  size = get_kernel_preferred_work_group_size_multiple(^(void) { // expected-error{{implicit declaration of function 'get_kernel_preferred_work_group_size_multiple' is invalid in OpenCL}}
     return;
   });
 }
@@ -18,27 +18,48 @@ kernel void dse_builtins() {
 void pipe_builtins() {
   int tmp;
 
-  read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'read_pipe' is invalid in C99}}
-  write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'write_pipe' is invalid in C99}}
+  foo(void); // expected-error{{implicit declaration of function 'foo' is invalid in OpenCL}}
+  // expected-note@-1{{'foo' declared here}}
+  // expected-error@-2{{expected expression}}
+  boo(); // expected-error{{implicit declaration of function 'boo' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'foo'?}}
 
-  reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'reserve_read_pipe' is invalid in C99}}
-  reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'reserve_write_pipe' is invalid in C99}}
+  read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'read_pipe' is invalid in OpenCL}}
+  write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'write_pipe' is invalid in OpenCL}}
 
-  work_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_reserve_read_pipe' is invalid in C99}}
-  work_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_reserve_write_pipe' is invalid in C99}}
+  reserve_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'reserve_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'reserve_read_pipe' declared here}}
+  reserve_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'reserve_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'reserve_read_pipe'?}}
 
-  sub_group_reserve_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_reserve_write_pipe' is invalid in C99}}
-  sub_group_reserve_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_reserve_read_pipe' is invalid in C99}}
+  work_group_reserve_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'work_group_reserve_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1 2{{'work_group_reserve_read_pipe' declared here}}
+  work_group_reserve_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'work_group_reserve_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'work_group_reserve_read_pipe'?}}
+  // expected-note@-2{{'work_group_reserve_write_pipe' declared here}}
 
-  commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'commit_read_pipe' is invalid in C99}}
-  commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'commit_write_pipe' is invalid in C99}}
+  sub_group_reserve_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'sub_group_reserve_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'work_group_reserve_write_pipe'?}}
+  sub_group_reserve_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'sub_group_reserve_read_pipe' is invalid in OpenCL}}
 
-  work_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'work_group_commit_read_pipe' is invalid in C99}}
-  work_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'work_group_commit_write_pipe' is invalid in C99}}
+  commit_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'commit_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'commit_read_pipe' declared here}}
+  commit_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'commit_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'commit_read_pipe'?}}
 
-  sub_group_commit_write_pipe(tmp, tmp); // expected-warning{{implicit declaration of function 'sub_group_commit_write_pipe' is invalid in C99}}
-  sub_group_commit_read_pipe(tmp, tmp);  // expected-warning{{implicit declaration of function 'sub_group_commit_read_pipe' is invalid in C99}}
+  work_group_commit_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'work_group_commit_read_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'work_group_commit_read_pipe' declared here}}
+  // expected-note@-2{{did you mean 'work_group_reserve_read_pipe'?}}
+  work_group_commit_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'work_group_commit_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{'work_group_commit_write_pipe' declared here}}
+  // expected-note@-2{{did you mean 'work_group_commit_read_pipe'?}}
 
-  get_pipe_num_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_num_packets' is invalid in C99}}
-  get_pipe_max_packets(tmp); // expected-warning{{implicit declaration of function 'get_pipe_max_packets' is invalid in C99}}
+  sub_group_commit_write_pipe(tmp, tmp); // expected-error{{implicit declaration of function 'sub_group_commit_write_pipe' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'work_group_commit_write_pipe'?}}
+  sub_group_commit_read_pipe(tmp, tmp);  // expected-error{{implicit declaration of function 'sub_group_commit_read_pipe' is invalid in OpenCL}}
+
+  get_pipe_num_packets(tmp); // expected-error{{implicit declaration of function 'get_pipe_num_packets' is invalid in OpenCL}}
+  // expected-note@-1{{'get_pipe_num_packets' declared here}}
+  get_pipe_max_packets(tmp); // expected-error{{implicit declaration of function 'get_pipe_max_packets' is invalid in OpenCL}}
+  // expected-note@-1{{did you mean 'get_pipe_num_packets'?}}
 }
diff --git a/test/SemaOpenCL/cond.cl b/test/SemaOpenCL/cond.cl
index 60f70564d861..851947cd390f 100644
--- a/test/SemaOpenCL/cond.cl
+++ b/test/SemaOpenCL/cond.cl
@@ -89,7 +89,7 @@ float2 ntest04(int2 C, int2 X, float2 Y)
 
 float2 ntest05(int2 C, int2 X, float Y)
 {
-  return C ? X : Y; // expected-error {{cannot convert between vector values of different size ('int2' (vector of 2 'int' values) and 'float')}}
+  return C ? X : Y; // expected-error {{scalar operand type has greater rank than the type of the vector element. ('int2' (vector of 2 'int' values) and 'float'}}
 }
 
 char2 ntest06(int2 C, char2 X, char2 Y)
diff --git a/test/SemaOpenCL/to_addr_builtin.cl b/test/SemaOpenCL/to_addr_builtin.cl
index a145626cfcee..56db4abed579 100644
--- a/test/SemaOpenCL/to_addr_builtin.cl
+++ b/test/SemaOpenCL/to_addr_builtin.cl
@@ -10,7 +10,7 @@ void test(void) {
 
   glob = to_global(glob, loc);
 #if __OPENCL_C_VERSION__ < CL_VERSION_2_0
-  // expected-warning@-2{{implicit declaration of function 'to_global' is invalid in C99}}
+  // expected-error@-2{{implicit declaration of function 'to_global' is invalid in OpenCL}}
   // expected-warning@-3{{incompatible integer to pointer conversion assigning to '__global int *' from 'int'}}
 #else
   // expected-error@-5{{invalid number of arguments to function: 'to_global'}}
diff --git a/tools/c-index-test/c-index-test.c b/tools/c-index-test/c-index-test.c
index 1f5d60443197..31ad828a2f10 100644
--- a/tools/c-index-test/c-index-test.c
+++ b/tools/c-index-test/c-index-test.c
@@ -1742,6 +1742,8 @@ int perform_test_load_source(int argc, const char **argv,
       return -1;
 
     if (Repeats > 1) {
+      clang_suspendTranslationUnit(TU);
+
       Err = clang_reparseTranslationUnit(TU, num_unsaved_files, unsaved_files,
                                          clang_defaultReparseOptions(TU));
       if (Err != CXError_Success) {
diff --git a/tools/libclang/CIndex.cpp b/tools/libclang/CIndex.cpp
index 394ee0ec4390..2d92de19d99c 100644
--- a/tools/libclang/CIndex.cpp
+++ b/tools/libclang/CIndex.cpp
@@ -3918,6 +3918,20 @@ void clang_disposeTranslationUnit(CXTranslationUnit CTUnit) {
   }
 }
 
+unsigned clang_suspendTranslationUnit(CXTranslationUnit CTUnit) {
+  if (CTUnit) {
+    ASTUnit *Unit = cxtu::getASTUnit(CTUnit);
+
+    if (Unit && Unit->isUnsafeToFree())
+      return false;
+
+    Unit->ResetForParse();
+    return true;
+  }
+
+  return false;
+}
+
 unsigned clang_defaultReparseOptions(CXTranslationUnit TU) {
   return CXReparse_None;
 }
diff --git a/tools/libclang/libclang.exports b/tools/libclang/libclang.exports
index d9a406e5741b..f3758469cb60 100644
--- a/tools/libclang/libclang.exports
+++ b/tools/libclang/libclang.exports
@@ -305,6 +305,7 @@ clang_remap_getFilenames
 clang_remap_getNumFiles
 clang_reparseTranslationUnit
 clang_saveTranslationUnit
+clang_suspendTranslationUnit
 clang_sortCodeCompletionResults
 clang_toggleCrashRecovery
 clang_tokenize