From e831c3dab0fd00c84d09be70b9d1b27285cba04d Mon Sep 17 00:00:00 2001 From: dim Date: Mon, 29 May 2017 16:25:25 +0000 Subject: [PATCH] Vendor import of llvm trunk r304149: https://llvm.org/svn/llvm-project/llvm/trunk@304149 --- CMakeLists.txt | 7 + docs/Benchmarking.rst | 87 + docs/GettingStartedVS.rst | 4 + docs/LangRef.rst | 471 ++- docs/Vectorizers.rst | 4 +- docs/index.rst | 1 + .../BuildingAJIT/Chapter1/KaleidoscopeJIT.h | 6 +- .../BuildingAJIT/Chapter2/KaleidoscopeJIT.h | 10 +- .../BuildingAJIT/Chapter3/KaleidoscopeJIT.h | 9 +- .../BuildingAJIT/Chapter4/KaleidoscopeJIT.h | 13 +- .../BuildingAJIT/Chapter5/KaleidoscopeJIT.h | 14 +- .../BuildingAJIT/Chapter5/Server/server.cpp | 28 +- .../Kaleidoscope/include/KaleidoscopeJIT.h | 9 +- include/llvm/ADT/Triple.h | 1 + include/llvm/Analysis/InstructionSimplify.h | 291 +- include/llvm/Analysis/LoopPass.h | 5 +- include/llvm/Analysis/ScalarEvolution.h | 6 + include/llvm/Analysis/TargetTransformInfo.h | 7 + .../llvm/Analysis/TargetTransformInfoImpl.h | 2 + include/llvm/Analysis/ValueTracking.h | 3 +- include/llvm/CodeGen/AsmPrinter.h | 6 +- include/llvm/CodeGen/AtomicExpandUtils.h | 26 +- include/llvm/CodeGen/DIE.h | 86 +- include/llvm/CodeGen/FaultMaps.h | 35 +- include/llvm/CodeGen/GlobalISel/Localizer.h | 78 + include/llvm/CodeGen/ISDOpcodes.h | 8 + include/llvm/CodeGen/LiveInterval.h | 86 +- include/llvm/CodeGen/LiveIntervalAnalysis.h | 43 +- include/llvm/CodeGen/LiveIntervalUnion.h | 14 +- include/llvm/CodeGen/LivePhysRegs.h | 120 +- include/llvm/CodeGen/LiveRangeEdit.h | 80 +- include/llvm/CodeGen/LiveStackAnalysis.h | 23 +- include/llvm/CodeGen/MachineBasicBlock.h | 121 +- .../llvm/CodeGen/MachineBlockFrequencyInfo.h | 17 +- .../llvm/CodeGen/MachineDominanceFrontier.h | 25 +- include/llvm/CodeGen/MachineDominators.h | 18 +- include/llvm/CodeGen/MachineInstr.h | 37 +- include/llvm/CodeGen/MachineRegisterInfo.h | 5 + include/llvm/CodeGen/MachineValueType.h | 2 +- include/llvm/CodeGen/ScheduleDAG.h | 6 +- include/llvm/CodeGen/ScheduleDAGInstrs.h | 5 +- include/llvm/CodeGen/SelectionDAG.h | 5 + include/llvm/CodeGen/SelectionDAGNodes.h | 26 + include/llvm/DebugInfo/CodeView/CVRecord.h | 8 + .../llvm/DebugInfo/CodeView/CVTypeVisitor.h | 1 + .../DebugInfo/CodeView/TypeDeserializer.h | 11 + .../DebugInfo/CodeView/TypeIndexDiscovery.h | 33 + include/llvm/DebugInfo/CodeView/TypeRecord.h | 11 +- .../llvm/DebugInfo/CodeView/TypeSerializer.h | 37 +- .../DebugInfo/CodeView/TypeStreamMerger.h | 69 +- .../DebugInfo/CodeView/TypeTableBuilder.h | 31 +- .../DebugInfo/CodeView/TypeTableCollection.h | 4 +- include/llvm/DebugInfo/DWARF/DWARFContext.h | 18 +- .../DebugInfo/DWARF/DWARFDebugRangeList.h | 3 + include/llvm/DebugInfo/DWARF/DWARFDie.h | 3 +- include/llvm/DebugInfo/DWARF/DWARFFormValue.h | 2 + include/llvm/DebugInfo/DWARF/DWARFRelocMap.h | 3 + include/llvm/DebugInfo/DWARF/DWARFUnit.h | 12 +- .../llvm/DebugInfo/MSF/MappedBlockStream.h | 16 +- .../DebugInfo/PDB/Native/DbiStreamBuilder.h | 1 + .../PDB/Native/PDBTypeServerHandler.h | 5 +- include/llvm/DebugInfo/PDB/Native/TpiStream.h | 7 + include/llvm/IR/Attributes.h | 46 +- include/llvm/IR/BasicBlock.h | 45 + include/llvm/IR/IntrinsicInst.h | 13 + include/llvm/IR/Intrinsics.td | 58 +- include/llvm/IR/IntrinsicsAMDGPU.td | 10 + include/llvm/IR/Metadata.h | 1 + include/llvm/IR/Module.h | 5 +- include/llvm/InitializePasses.h | 2 + include/llvm/LTO/Config.h | 2 +- include/llvm/Object/Binary.h | 4 +- include/llvm/Object/COFF.h | 1 + include/llvm/Object/ELFObjectFile.h | 12 + include/llvm/Object/MachO.h | 1 
+ include/llvm/Object/ObjectFile.h | 6 + include/llvm/Object/RelocVisitor.h | 513 +-- include/llvm/Object/Wasm.h | 1 + include/llvm/Option/OptTable.h | 8 + include/llvm/ProfileData/InstrProf.h | 6 +- include/llvm/TableGen/Record.h | 48 +- include/llvm/Target/TargetLowering.h | 6 +- include/llvm/Transforms/Scalar.h | 7 + include/llvm/Transforms/Scalar/GVN.h | 34 +- include/llvm/Transforms/Utils/Local.h | 12 + lib/Analysis/ConstantFolding.cpp | 7 +- lib/Analysis/InstructionSimplify.cpp | 152 +- lib/Analysis/Lint.cpp | 8 +- lib/Analysis/LoopPass.cpp | 23 +- lib/Analysis/ScalarEvolution.cpp | 74 +- lib/Analysis/ScalarEvolutionExpander.cpp | 20 +- lib/Analysis/TargetTransformInfo.cpp | 4 + lib/Analysis/ValueTracking.cpp | 6 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 34 +- lib/Bitcode/Writer/ValueEnumerator.cpp | 7 +- lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 10 +- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 8 +- lib/CodeGen/AsmPrinter/DIEHash.cpp | 120 +- lib/CodeGen/AsmPrinter/DIEHash.h | 55 +- lib/CodeGen/AsmPrinter/DIEHashAttributes.def | 55 + lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 9 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 8 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 4 +- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 125 +- lib/CodeGen/AsmPrinter/DwarfDebug.h | 9 +- lib/CodeGen/AtomicExpandPass.cpp | 2 +- lib/CodeGen/BasicTargetTransformInfo.cpp | 2 - lib/CodeGen/BranchCoalescing.cpp | 6 +- lib/CodeGen/BranchFolding.cpp | 11 +- lib/CodeGen/BranchFolding.h | 1 + lib/CodeGen/BranchRelaxation.cpp | 6 +- lib/CodeGen/CodeGenPrepare.cpp | 4 +- lib/CodeGen/DeadMachineInstructionElim.cpp | 4 +- lib/CodeGen/DetectDeadLanes.cpp | 3 +- lib/CodeGen/DwarfEHPrepare.cpp | 4 +- lib/CodeGen/EarlyIfConversion.cpp | 8 +- lib/CodeGen/ExpandISelPseudos.cpp | 2 +- lib/CodeGen/ExpandPostRAPseudos.cpp | 2 +- lib/CodeGen/FuncletLayout.cpp | 2 +- lib/CodeGen/GlobalISel/CMakeLists.txt | 5 +- lib/CodeGen/GlobalISel/GlobalISel.cpp | 1 + lib/CodeGen/GlobalISel/Localizer.cpp | 125 + lib/CodeGen/GlobalMerge.cpp | 5 +- lib/CodeGen/IfConversion.cpp | 33 +- lib/CodeGen/ImplicitNullChecks.cpp | 4 +- lib/CodeGen/InterleavedAccessPass.cpp | 6 +- lib/CodeGen/LexicalScopes.cpp | 5 +- lib/CodeGen/LiveDebugValues.cpp | 4 +- lib/CodeGen/LiveDebugVariables.cpp | 6 +- lib/CodeGen/LiveIntervalAnalysis.cpp | 49 +- lib/CodeGen/LivePhysRegs.cpp | 89 +- lib/CodeGen/LiveStackAnalysis.cpp | 4 +- lib/CodeGen/LocalStackSlotAllocation.cpp | 4 +- lib/CodeGen/LowerEmuTLS.cpp | 2 +- lib/CodeGen/MachineBlockFrequencyInfo.cpp | 6 +- lib/CodeGen/MachineBlockPlacement.cpp | 4 +- lib/CodeGen/MachineCSE.cpp | 12 +- lib/CodeGen/MachineCombiner.cpp | 4 +- lib/CodeGen/MachineCopyPropagation.cpp | 4 +- lib/CodeGen/MachineLICM.cpp | 10 +- lib/CodeGen/MachineOutliner.cpp | 2 +- lib/CodeGen/MachinePipeliner.cpp | 4 +- lib/CodeGen/MachineScheduler.cpp | 14 +- lib/CodeGen/MachineSink.cpp | 8 +- lib/CodeGen/MachineTraceMetrics.cpp | 8 +- lib/CodeGen/MachineVerifier.cpp | 5 - lib/CodeGen/OptimizePHIs.cpp | 4 +- lib/CodeGen/PHIElimination.cpp | 4 +- lib/CodeGen/PostRASchedulerList.cpp | 4 +- lib/CodeGen/ProcessImplicitDefs.cpp | 6 +- lib/CodeGen/PrologEpilogInserter.cpp | 9 +- lib/CodeGen/RenameIndependentSubregs.cpp | 4 +- lib/CodeGen/SafeStack.cpp | 6 +- lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 8 +- lib/CodeGen/ScheduleDAGInstrs.cpp | 206 +- lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 176 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 67 + lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 79 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 90 +- 
.../SelectionDAG/SelectionDAGBuilder.h | 2 +- lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 55 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 34 +- lib/CodeGen/ShadowStackGCLowering.cpp | 6 +- lib/CodeGen/ShrinkWrap.cpp | 5 +- lib/CodeGen/SjLjEHPrepare.cpp | 2 +- lib/CodeGen/SlotIndexes.cpp | 2 +- lib/CodeGen/SpillPlacement.cpp | 6 +- lib/CodeGen/StackColoring.cpp | 10 +- lib/CodeGen/StackProtector.cpp | 4 +- lib/CodeGen/StackSlotColoring.cpp | 6 +- lib/CodeGen/TailDuplication.cpp | 3 +- lib/CodeGen/TailDuplicator.cpp | 4 +- lib/CodeGen/TwoAddressInstructionPass.cpp | 6 +- lib/CodeGen/WinEHPrepare.cpp | 2 +- lib/DebugInfo/CodeView/CMakeLists.txt | 1 + lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 20 +- lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp | 371 ++ lib/DebugInfo/CodeView/TypeSerializer.cpp | 234 +- lib/DebugInfo/CodeView/TypeStreamMerger.cpp | 436 +-- .../CodeView/TypeTableCollection.cpp | 3 +- lib/DebugInfo/DWARF/DWARFContext.cpp | 119 +- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 8 +- lib/DebugInfo/DWARF/DWARFDie.cpp | 13 +- lib/DebugInfo/DWARF/DWARFFormValue.cpp | 4 +- lib/DebugInfo/DWARF/DWARFUnit.cpp | 39 +- lib/DebugInfo/MSF/MappedBlockStream.cpp | 32 +- lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp | 25 +- .../PDB/Native/PDBTypeServerHandler.cpp | 19 +- lib/DebugInfo/PDB/Native/TpiStream.cpp | 4 + lib/Demangle/ItaniumDemangle.cpp | 58 +- .../RuntimeDyld/RuntimeDyldELF.cpp | 37 +- lib/Fuzzer/FuzzerUtilPosix.cpp | 17 +- lib/Fuzzer/test/fuzzer-segv.test | 2 + lib/IR/AttributeImpl.h | 47 +- lib/IR/Attributes.cpp | 394 +- lib/IR/BasicBlock.cpp | 16 +- lib/IR/DebugLoc.cpp | 2 +- lib/IR/Instructions.cpp | 6 + lib/IR/IntrinsicInst.cpp | 26 +- lib/IR/Module.cpp | 4 +- lib/IR/Verifier.cpp | 40 +- lib/LTO/LTO.cpp | 44 +- lib/LTO/LTOBackend.cpp | 21 +- lib/Linker/IRMover.cpp | 17 +- lib/MC/WasmObjectWriter.cpp | 32 +- lib/Object/COFFObjectFile.cpp | 4 + lib/Object/MachOObjectFile.cpp | 4 + lib/Object/WasmObjectFile.cpp | 4 + lib/Option/OptTable.cpp | 14 + lib/Passes/PassBuilder.cpp | 25 +- lib/ProfileData/InstrProf.cpp | 16 +- lib/Support/APInt.cpp | 4 +- lib/Support/BinaryStreamReader.cpp | 27 +- lib/Support/ConvertUTF.cpp | 31 + lib/Support/DebugCounter.cpp | 2 + lib/Support/DynamicLibrary.cpp | 11 +- lib/Support/GraphWriter.cpp | 1 + lib/Support/Host.cpp | 1 + lib/Support/Path.cpp | 1 + lib/Support/Triple.cpp | 17 +- lib/Support/YAMLParser.cpp | 4 + lib/TableGen/Record.cpp | 39 +- lib/Target/AArch64/AArch64AsmPrinter.cpp | 5 +- .../AArch64/AArch64ExpandPseudoInsts.cpp | 119 +- lib/Target/AArch64/AArch64FrameLowering.cpp | 11 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 31 +- lib/Target/AArch64/AArch64InstrInfo.cpp | 127 +- lib/Target/AArch64/AArch64InstrInfo.h | 2 +- lib/Target/AArch64/AArch64InstrInfo.td | 2 +- lib/Target/AArch64/AArch64MacroFusion.cpp | 13 + lib/Target/AArch64/AArch64SchedFalkor.td | 84 +- .../AArch64/AArch64SchedFalkorDetails.td | 1063 +++-- .../AArch64/AArch64SchedFalkorWriteRes.td | 403 -- lib/Target/AArch64/AArch64Subtarget.cpp | 1 - lib/Target/AArch64/AArch64TargetMachine.cpp | 12 +- lib/Target/AMDGPU/AMDGPU.td | 20 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 45 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 3 + lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2 + lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 213 +- lib/Target/AMDGPU/AMDGPUSubtarget.h | 8 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 83 +- .../Disassembler/AMDGPUDisassembler.cpp | 93 +- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 5 + lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 2 + 
lib/Target/AMDGPU/GCNMinRegStrategy.cpp | 2 + lib/Target/AMDGPU/GCNRegPressure.cpp | 6 +- .../AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h | 12 + .../AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 46 + lib/Target/AMDGPU/R600ISelLowering.cpp | 8 + lib/Target/AMDGPU/R600ISelLowering.h | 2 + lib/Target/AMDGPU/R600RegisterInfo.td | 2 +- lib/Target/AMDGPU/SIDefines.h | 19 +- lib/Target/AMDGPU/SIISelLowering.cpp | 52 +- lib/Target/AMDGPU/SIISelLowering.h | 2 + lib/Target/AMDGPU/SIInstrInfo.cpp | 4 + lib/Target/AMDGPU/SIInstrInfo.td | 180 +- lib/Target/AMDGPU/SOPInstructions.td | 4 +- lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 11 + lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 4 + lib/Target/AMDGPU/VOP1Instructions.td | 33 +- lib/Target/AMDGPU/VOP2Instructions.td | 70 +- lib/Target/AMDGPU/VOP3Instructions.td | 9 +- lib/Target/AMDGPU/VOPCInstructions.td | 37 + lib/Target/AMDGPU/VOPInstructions.td | 116 +- lib/Target/ARM/ARMCallLowering.cpp | 111 +- lib/Target/ARM/ARMCallLowering.h | 5 +- lib/Target/ARM/ARMExpandPseudoInsts.cpp | 4 +- lib/Target/ARM/ARMISelLowering.cpp | 8 +- lib/Target/ARM/ARMISelLowering.h | 2 +- lib/Target/ARM/ARMInstrNEON.td | 496 +-- lib/Target/ARM/ARMSchedule.td | 11 + lib/Target/ARM/ARMScheduleA9.td | 9 + lib/Target/ARM/ARMScheduleR52.td | 103 +- lib/Target/ARM/ARMScheduleSwift.td | 10 + lib/Target/ARM/ARMTargetMachine.cpp | 63 +- lib/Target/ARM/ARMTargetMachine.h | 62 +- lib/Target/ARM/ARMTargetObjectFile.cpp | 4 +- lib/Target/ARM/Thumb1FrameLowering.cpp | 10 +- lib/Target/AVR/AVRInstrInfo.td | 1 - lib/Target/BPF/BPFISelLowering.cpp | 9 +- lib/Target/BPF/BPFISelLowering.h | 4 + lib/Target/Hexagon/HexagonFrameLowering.cpp | 2 +- lib/Target/Hexagon/HexagonInstrInfo.cpp | 4 +- lib/Target/Hexagon/HexagonPseudo.td | 39 +- lib/Target/Hexagon/HexagonRegisterInfo.cpp | 5 - lib/Target/Hexagon/HexagonRegisterInfo.h | 1 - lib/Target/Hexagon/HexagonVLIWPacketizer.cpp | 4 +- lib/Target/LLVMBuild.txt | 1 + lib/Target/MSP430/MSP430.td | 14 + lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 4 +- lib/Target/MSP430/MSP430ISelLowering.cpp | 27 +- lib/Target/MSP430/MSP430InstrInfo.td | 5 + lib/Target/MSP430/MSP430RegisterInfo.cpp | 2 +- lib/Target/MSP430/MSP430Subtarget.cpp | 27 +- lib/Target/MSP430/MSP430Subtarget.h | 11 + lib/Target/Mips/MipsISelLowering.cpp | 34 +- lib/Target/Mips/MipsSubtarget.cpp | 7 +- lib/Target/Mips/MipsSubtarget.h | 7 +- lib/Target/Nios2/CMakeLists.txt | 18 + lib/Target/Nios2/LLVMBuild.txt | 61 + lib/Target/Nios2/MCTargetDesc/CMakeLists.txt | 2 + lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt | 25 + .../Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp | 25 + .../Nios2/MCTargetDesc/Nios2MCTargetDesc.h | 34 + lib/Target/Nios2/Nios2.h | 25 + lib/Target/Nios2/Nios2.td | 29 + lib/Target/Nios2/Nios2InstrFormats.td | 117 + lib/Target/Nios2/Nios2InstrInfo.td | 50 + lib/Target/Nios2/Nios2RegisterInfo.td | 60 + lib/Target/Nios2/Nios2TargetMachine.cpp | 46 + lib/Target/Nios2/Nios2TargetMachine.h | 30 + lib/Target/Nios2/TargetInfo/CMakeLists.txt | 1 + lib/Target/Nios2/TargetInfo/LLVMBuild.txt | 23 + .../Nios2/TargetInfo/Nios2TargetInfo.cpp | 24 + lib/Target/PowerPC/PPCExpandISEL.cpp | 2 +- lib/Target/PowerPC/PPCISelLowering.cpp | 92 +- lib/Target/PowerPC/PPCISelLowering.h | 6 +- lib/Target/PowerPC/PPCInstr64Bit.td | 4 +- lib/Target/PowerPC/PPCInstrInfo.cpp | 2 + lib/Target/PowerPC/PPCInstrInfo.td | 2 +- lib/Target/PowerPC/PPCInstrVSX.td | 74 +- lib/Target/SystemZ/SystemZExpandPseudo.cpp | 2 +- lib/Target/SystemZ/SystemZInstrInfo.cpp | 30 +- .../SystemZ/SystemZTargetTransformInfo.h | 1 + 
lib/Target/X86/AsmParser/X86AsmParser.cpp | 5 +- lib/Target/X86/CMakeLists.txt | 1 + lib/Target/X86/X86.td | 3 + lib/Target/X86/X86FloatingPoint.cpp | 4 +- lib/Target/X86/X86ISelLowering.cpp | 8 + lib/Target/X86/X86InstrAVX512.td | 150 +- lib/Target/X86/X86InstrArithmetic.td | 24 +- lib/Target/X86/X86InstrFMA.td | 13 +- lib/Target/X86/X86InstrFormats.td | 10 + lib/Target/X86/X86InstrInfo.cpp | 3406 +---------------- lib/Target/X86/X86InstrInfo.td | 23 +- lib/Target/X86/X86InstrMMX.td | 5 +- lib/Target/X86/X86InstrSSE.td | 66 +- lib/Target/X86/X86InstrXOP.td | 8 +- lib/Target/X86/X86InstructionSelector.cpp | 20 + lib/Target/X86/X86LegalizerInfo.cpp | 17 + lib/Target/X86/X86LegalizerInfo.h | 1 + lib/Target/X86/X86Subtarget.cpp | 1 + lib/Target/X86/X86Subtarget.h | 4 + lib/Transforms/Coroutines/CoroCleanup.cpp | 1 + lib/Transforms/Coroutines/CoroEarly.cpp | 3 + lib/Transforms/Coroutines/CoroElide.cpp | 1 + lib/Transforms/Coroutines/CoroFrame.cpp | 33 +- lib/Transforms/Coroutines/CoroSplit.cpp | 86 +- lib/Transforms/IPO/PartialInlining.cpp | 10 +- lib/Transforms/IPO/PassManagerBuilder.cpp | 15 + .../InstCombine/InstCombineAddSub.cpp | 14 +- .../InstCombine/InstCombineAndOrXor.cpp | 12 +- .../InstCombine/InstCombineCalls.cpp | 8 +- .../InstCombine/InstCombineCasts.cpp | 16 +- .../InstCombine/InstCombineCompares.cpp | 48 +- .../InstCombine/InstCombineInternal.h | 6 + .../InstCombine/InstCombineMulDivRem.cpp | 8 +- .../InstCombine/InstCombineShifts.cpp | 3 +- .../InstCombineSimplifyDemanded.cpp | 92 +- .../InstCombine/InstructionCombining.cpp | 19 +- .../Instrumentation/PGOInstrumentation.cpp | 2 +- .../Instrumentation/SanitizerCoverage.cpp | 5 +- lib/Transforms/Scalar/CMakeLists.txt | 1 + lib/Transforms/Scalar/ConstantHoisting.cpp | 6 +- lib/Transforms/Scalar/GVN.cpp | 164 +- lib/Transforms/Scalar/GVNSink.cpp | 872 +++++ lib/Transforms/Scalar/GuardWidening.cpp | 4 +- .../Scalar/InductiveRangeCheckElimination.cpp | 7 +- lib/Transforms/Scalar/JumpThreading.cpp | 42 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 33 +- lib/Transforms/Scalar/LoopUnswitch.cpp | 7 +- lib/Transforms/Scalar/NewGVN.cpp | 70 +- lib/Transforms/Scalar/SCCP.cpp | 3 +- lib/Transforms/Scalar/SROA.cpp | 2 +- lib/Transforms/Scalar/Scalar.cpp | 1 + lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 76 +- lib/Transforms/Utils/CloneFunction.cpp | 2 +- lib/Transforms/Utils/FunctionComparator.cpp | 10 +- lib/Transforms/Utils/InlineFunction.cpp | 7 +- lib/Transforms/Utils/Local.cpp | 68 +- lib/Transforms/Utils/SimplifyCFG.cpp | 50 +- lib/Transforms/Utils/SimplifyLibCalls.cpp | 4 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 157 +- test/Analysis/CostModel/AArch64/falkor.ll | 26 - .../constant_functions_multi_dim.ll | 80 + test/Analysis/IVUsers/quadradic-exit-value.ll | 36 +- .../ScalarEvolution/different-loops-recs.ll | 64 +- .../AArch64/GlobalISel/arm64-fallback.ll | 2 +- .../GlobalISel/gisel-commandline-option.ll | 5 +- .../GlobalISel/localizer-in-O0-pipeline.mir | 96 + test/CodeGen/AArch64/GlobalISel/localizer.mir | 312 ++ test/CodeGen/AArch64/aarch64-stp-cluster.ll | 2 +- test/CodeGen/AArch64/arm64-csldst-mmo.ll | 2 +- test/CodeGen/AArch64/arm64-inline-asm.ll | 7 + test/CodeGen/AArch64/arm64-ldp-cluster.ll | 4 +- .../AArch64/arm64-misched-basic-A53.ll | 4 +- .../AArch64/arm64-misched-basic-A57.ll | 2 +- .../AArch64/arm64-misched-forwarding-A53.ll | 2 +- .../AArch64/arm64-misched-memdep-bug.ll | 2 +- .../CodeGen/AArch64/arm64-misched-multimmo.ll | 2 +- test/CodeGen/AArch64/arm64-vabs.ll | 24 +- test/CodeGen/AArch64/arm64-vadd.ll | 
12 +- test/CodeGen/AArch64/arm64-vmul.ll | 24 +- test/CodeGen/AArch64/arm64-vshift.ll | 12 +- test/CodeGen/AArch64/arm64-vsub.ll | 24 +- test/CodeGen/AArch64/asm-print-comments.ll | 17 + test/CodeGen/AArch64/cmpxchg-O0.ll | 10 +- test/CodeGen/AArch64/fast-isel-cmpxchg.ll | 9 +- .../AArch64/live-interval-analysis.mir | 4 +- test/CodeGen/AArch64/misched-fusion-aes.ll | 145 +- test/CodeGen/AArch64/optimize-imm.ll | 19 + test/CodeGen/AArch64/scheduledag-constreg.mir | 2 +- .../CodeGen/AArch64/tailcall_misched_graph.ll | 2 +- .../AMDGPU/GlobalISel/legalize-constant.mir | 20 + test/CodeGen/AMDGPU/bfe-combine.ll | 41 + test/CodeGen/AMDGPU/extload-align.ll | 4 +- test/CodeGen/AMDGPU/kernel-args.ll | 4 +- test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll | 15 + test/CodeGen/AMDGPU/load-constant-i16.ll | 2 +- test/CodeGen/AMDGPU/load-constant-i8.ll | 2 +- test/CodeGen/AMDGPU/load-global-i16.ll | 4 +- test/CodeGen/AMDGPU/load-global-i8.ll | 4 +- test/CodeGen/AMDGPU/load-local-i16.ll | 2 +- test/CodeGen/AMDGPU/load-local-i8.ll | 2 +- test/CodeGen/AMDGPU/min.ll | 2 +- test/CodeGen/AMDGPU/parallelandifcollapse.ll | 2 +- test/CodeGen/AMDGPU/r600.bitcast.ll | 2 +- test/CodeGen/AMDGPU/schedule-regpressure.mir | 2 +- test/CodeGen/AMDGPU/setcc.ll | 2 +- test/CodeGen/AMDGPU/shl-add-to-add-shl.ll | 40 + test/CodeGen/AMDGPU/shl.ll | 2 +- test/CodeGen/AMDGPU/sra.ll | 2 +- test/CodeGen/AMDGPU/store-global.ll | 4 +- test/CodeGen/AMDGPU/store-private.ll | 4 +- test/CodeGen/AMDGPU/sub.ll | 2 +- test/CodeGen/AMDGPU/unknown-processor.ll | 4 +- test/CodeGen/AMDGPU/vector-alloca.ll | 22 + .../ARM/GlobalISel/arm-irtranslator.ll | 354 ++ .../CodeGen/ARM/GlobalISel/arm-unsupported.ll | 12 +- test/CodeGen/ARM/arm-shrink-wrapping.ll | 7 +- test/CodeGen/ARM/constantpool-promote-dbg.ll | 2 +- test/CodeGen/ARM/constantpool-promote-ldrh.ll | 4 +- test/CodeGen/ARM/constantpool-promote.ll | 24 +- test/CodeGen/ARM/cortexr52-misched-basic.ll | 4 +- test/CodeGen/ARM/fastisel-thumb-litpool.ll | 1 + test/CodeGen/ARM/memcpy-inline.ll | 13 +- test/CodeGen/ARM/memset-inline.ll | 84 +- test/CodeGen/ARM/misched-copy-arm.ll | 2 +- test/CodeGen/ARM/misched-fp-basic.ll | 6 +- test/CodeGen/ARM/misched-int-basic-thumb2.mir | 6 +- test/CodeGen/ARM/misched-int-basic.mir | 6 +- test/CodeGen/ARM/single-issue-r52.mir | 4 +- test/CodeGen/ARM/vcombine.ll | 4 +- test/CodeGen/ARM/vext.ll | 8 +- test/CodeGen/Hexagon/post-ra-kill-update.mir | 2 +- .../Lanai/lanai-misched-trivial-disjoint.ll | 2 +- .../PR32721_ifcvt_triangle_unanalyzable.mir | 24 - test/CodeGen/MSP430/hwmult16.ll | 1 + test/CodeGen/MSP430/hwmult32.ll | 1 + test/CodeGen/MSP430/hwmultf5.ll | 1 + test/CodeGen/MSP430/vararg.ll | 2 +- test/CodeGen/Nios2/lit.local.cfg | 3 + test/CodeGen/Nios2/target_support.ll | 11 + test/CodeGen/PowerPC/atomics-constant.ll | 23 + test/CodeGen/PowerPC/build-vector-tests.ll | 216 +- test/CodeGen/PowerPC/livephysregs.mir | 52 + .../PowerPC/p8altivec-shuffles-pred.ll | 2 +- .../PowerPC/p9-xxinsertw-xxextractuw.ll | 72 +- test/CodeGen/PowerPC/ppc64-i128-abi.ll | 8 +- test/CodeGen/PowerPC/pr25157-peephole.ll | 2 +- test/CodeGen/PowerPC/pr27078.ll | 8 +- test/CodeGen/PowerPC/swaps-le-6.ll | 8 +- test/CodeGen/PowerPC/vec_sldwi.ll | 307 ++ test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll | 48 +- test/CodeGen/PowerPC/vsx-ldst.ll | 4 +- test/CodeGen/PowerPC/vsx-p9.ll | 72 +- test/CodeGen/PowerPC/vsx_insert_extract_le.ll | 4 +- test/CodeGen/PowerPC/vsx_shuffle_le.ll | 48 +- test/CodeGen/Thumb/machine-cse-physreg.mir | 35 + test/CodeGen/X86/2009-02-26-MachineLICMBug.ll | 2 +- 
test/CodeGen/X86/GlobalISel/memop-vec.ll | 113 +- .../X86/GlobalISel/regbankselect-AVX2.mir | 55 +- .../X86/GlobalISel/regbankselect-AVX512.mir | 87 +- .../X86/GlobalISel/select-leaf-constant.mir | 96 + .../X86/GlobalISel/select-memop-v256.mir | 188 + .../X86/GlobalISel/select-memop-v512.mir | 127 + test/CodeGen/X86/avx-vzeroupper.ll | 221 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 6 +- .../CodeGen/X86/avx512vpopcntdq-intrinsics.ll | 88 + test/CodeGen/X86/fast-isel-select-cmp.ll | 10 +- test/CodeGen/X86/fp-intrinsics.ll | 147 + test/CodeGen/X86/hoist-invariant-load.ll | 2 +- test/CodeGen/X86/misched-copy.ll | 2 +- test/CodeGen/X86/or-branch.ll | 66 +- test/CodeGen/X86/pr27681.mir | 2 +- test/CodeGen/X86/sandybridge-loads.ll | 42 +- .../CodeGen/X86/sse42-intrinsics-fast-isel.ll | 6 +- test/CodeGen/X86/stack-folding-fp-avx1.ll | 21 +- test/CodeGen/X86/twoaddr-coalesce-2.ll | 2 +- test/CodeGen/X86/vector-narrow-binop.ll | 63 +- test/CodeGen/X86/vector-popcnt-128.ll | 109 +- test/CodeGen/X86/vector-popcnt-256.ll | 42 + test/CodeGen/X86/vector-popcnt-512.ll | 53 + test/CodeGen/X86/vector-shuffle-512-v16.ll | 3 +- test/CodeGen/X86/vector-shuffle-avx512.ll | 34 +- test/CodeGen/X86/vector-sqrt.ll | 18 +- test/CodeGen/X86/vector-trunc-math.ll | 6 +- test/CodeGen/X86/vector-tzcnt-128.ll | 159 + test/CodeGen/X86/vector-tzcnt-256.ll | 379 +- test/CodeGen/X86/vector-tzcnt-512.ll | 153 + test/CodeGen/X86/wide-integer-cmp.ll | 1 - test/CodeGen/X86/widened-broadcast.ll | 73 +- test/CodeGen/X86/x86-interleaved-access.ll | 12 +- test/CodeGen/X86/x87.ll | 11 +- test/CodeGen/XCore/epilogue_prologue.ll | 24 +- test/DebugInfo/Generic/empty.ll | 9 +- test/DebugInfo/Generic/nodebug.ll | 13 +- test/DebugInfo/Generic/skeletoncu.ll | 4 +- test/DebugInfo/Inputs/split-dwarf-dwp.cpp | 12 + test/DebugInfo/Inputs/split-dwarf-dwp.o | Bin 0 -> 2744 bytes test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp | Bin 0 -> 1256 bytes test/DebugInfo/MIR/X86/empty-inline.mir | 122 + test/DebugInfo/PDB/Inputs/merge-ids-1.yaml | 36 + test/DebugInfo/PDB/Inputs/merge-ids-2.yaml | 31 + .../PDB/Inputs/merge-ids-and-types-1.yaml | 113 + .../PDB/Inputs/merge-ids-and-types-2.yaml | 143 + .../{merge1.yaml => merge-types-1.yaml} | 0 .../{merge2.yaml => merge-types-2.yaml} | 0 test/DebugInfo/PDB/Inputs/source-names-1.yaml | 8 + test/DebugInfo/PDB/Inputs/source-names-2.yaml | 8 + .../PDB/pdbdump-merge-ids-and-types.test | 65 + test/DebugInfo/PDB/pdbdump-mergeids.test | 31 + test/DebugInfo/PDB/pdbdump-mergetypes.test | 4 +- test/DebugInfo/PDB/pdbdump-objfilename.yaml | 14 + test/DebugInfo/PDB/pdbdump-source-names.test | 20 + test/DebugInfo/X86/array.ll | 150 +- test/DebugInfo/X86/dbg-value-frame-index.ll | 2 +- test/DebugInfo/X86/debug-loc-offset.ll | 8 +- test/DebugInfo/X86/debug-macro.ll | 72 +- test/DebugInfo/X86/empty.ll | 11 +- test/DebugInfo/X86/fission-hash.ll | 10 +- test/DebugInfo/X86/gnu-public-names-empty.ll | 9 +- test/DebugInfo/X86/gnu-public-names-gmlt.ll | 68 + .../X86/split-dwarf-multiple-cu-hash.ll | 42 + test/DebugInfo/X86/split-dwarf-omit-empty.ll | 54 + test/DebugInfo/dwo.ll | 4 +- test/DebugInfo/llvm-symbolizer.test | 8 + test/DebugInfo/omit-empty.ll | 12 + test/DebugInfo/skeletoncu.ll | 4 +- test/ExecutionEngine/MCJIT/lit.local.cfg | 3 +- test/ExecutionEngine/OrcMCJIT/lit.local.cfg | 3 +- test/ExecutionEngine/OrcMCJIT/pr32650.ll | 28 + test/Feature/fp-intrinsics.ll | 148 + .../SanitizerCoverage/chains.ll | 33 + .../SanitizerCoverage/postdominator_check.ll | 85 + test/LTO/Resolution/X86/linkonce.ll | 11 + 
test/LTO/Resolution/X86/type-checked-load.ll | 16 + test/Linker/Inputs/module-flags-pic-2-b.ll | 5 +- test/Linker/module-flags-pic-2-a.ll | 13 +- test/MC/AMDGPU/vop_sdwa.s | 441 ++- test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt | 477 +++ test/MC/Disassembler/X86/avx-512.txt | 23 + test/MC/WebAssembly/unnamed-data.ll | 53 + test/MC/X86/pr22004.s | 3 + test/MC/X86/x86-64-avx512vpopcntdq.s | 225 ++ test/Other/new-pm-defaults.ll | 1 + test/TableGen/GlobalISelEmitter.td | 26 + test/Transforms/Coroutines/coro-debug.ll | 142 + test/Transforms/Coroutines/coro-frame.ll | 11 +- .../Transforms/Coroutines/coro-materialize.ll | 52 + test/Transforms/EarlyCSE/const-speculation.ll | 39 + test/Transforms/GVN/PRE/phi-translate-2.ll | 105 + test/Transforms/GVN/PRE/pre-gep-load.ll | 2 +- test/Transforms/GVN/PRE/pre-load.ll | 6 +- test/Transforms/GVNSink/dither.ll | 42 + test/Transforms/GVNSink/indirect-call.ll | 70 + test/Transforms/GVNSink/sink-common-code.ll | 694 ++++ test/Transforms/GVNSink/struct.ll | 71 + .../GlobalDCE/externally_available.ll | 19 +- test/Transforms/Inline/prof-update-instr.ll | 57 + .../{prof-update.ll => prof-update-sample.ll} | 0 .../InstCombine/2008-07-10-ICmpBinOp.ll | 19 - .../InstCombine/2008-08-17-ICmpXorSignbit.ll | 87 - test/Transforms/InstCombine/alloca.ll | 7 +- .../InstCombine/bitcast-vec-canon.ll | 37 +- test/Transforms/InstCombine/bitcast.ll | 45 + test/Transforms/InstCombine/ctpop.ll | 27 +- .../InstCombine/icmp-xor-signbit.ll | 228 ++ test/Transforms/InstCombine/icmp.ll | 64 + test/Transforms/InstSimplify/call.ll | 253 +- test/Transforms/InstSimplify/or.ll | 41 +- test/Transforms/JumpThreading/assume.ll | 145 +- .../JumpThreading/fold-not-thread.ll | 4 +- test/Transforms/JumpThreading/guards.ll | 91 +- test/Transforms/LoopIdiom/pr33114.ll | 35 + .../X86/incorrect-offset-scaling.ll | 12 +- .../{ => X86}/lsr-expand-quadratic.ll | 26 +- .../LoopStrengthReduce/nonintegral.ll | 45 + .../LoopStrengthReduce/post-inc-icmpzero.ll | 4 +- .../AArch64/no_vector_instructions.ll | 26 + .../LoopVectorize/SystemZ/addressing.ll | 72 + .../X86/vectorization-remarks-missed.ll | 124 +- test/Transforms/NewGVN/pr32403.ll | 3 +- test/Transforms/NewGVN/pr32836.ll | 45 + .../SimpleLoopUnswitch/trivial-unswitch.ll | 61 + test/Verifier/fp-intrinsics.ll | 39 +- test/Verifier/module-flags-1.ll | 6 +- test/tools/gold/X86/relocation-model-pic.ll | 63 + test/tools/llvm-nm/X86/Inputs/example.lib | Bin 0 -> 2000 bytes test/tools/llvm-nm/X86/importlibrary.test | 7 + .../llvm-profdata/memop-size-prof.proftext | 2 +- tools/gold/gold-plugin.cpp | 6 +- tools/llvm-nm/llvm-nm.cpp | 17 +- .../PrettyClassLayoutGraphicalDumper.cpp | 3 +- tools/llvm-pdbdump/YAMLOutputStyle.cpp | 14 + tools/llvm-pdbdump/llvm-pdbdump.cpp | 37 +- tools/llvm-pdbdump/llvm-pdbdump.h | 1 + tools/llvm-profdata/llvm-profdata.cpp | 2 +- tools/llvm-readobj/COFFDumper.cpp | 4 +- unittests/Analysis/ScalarEvolutionTest.cpp | 88 +- unittests/DebugInfo/CodeView/CMakeLists.txt | 1 + unittests/DebugInfo/CodeView/ErrorChecking.h | 9 + .../CodeView/TypeIndexDiscoveryTest.cpp | 496 +++ .../DebugInfo/DWARF/DWARFDebugInfoTest.cpp | 24 +- .../DebugInfo/PDB/MappedBlockStreamTest.cpp | 51 +- unittests/IR/BasicBlockTest.cpp | 75 + unittests/IR/CMakeLists.txt | 1 + unittests/IR/DominatorTreeTest.cpp | 127 +- unittests/Support/CrashRecoveryTest.cpp | 4 - utils/TableGen/CMakeLists.txt | 1 + utils/TableGen/GlobalISelEmitter.cpp | 92 +- utils/TableGen/TableGen.cpp | 6 + utils/TableGen/TableGenBackends.h | 1 + utils/TableGen/X86FoldTablesEmitter.cpp | 720 ++++ 
utils/{abtest => }/abtest.py | 58 +- utils/abtest/mark_aarch64fns.py | 65 - utils/abtest/mark_armfns.py | 54 - utils/git-svn/git-llvm | 10 + utils/lit/lit/TestRunner.py | 2 +- utils/release/merge-request.sh | 2 +- 641 files changed, 20467 insertions(+), 10059 deletions(-) create mode 100644 docs/Benchmarking.rst create mode 100644 include/llvm/CodeGen/GlobalISel/Localizer.h create mode 100644 include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h create mode 100644 lib/CodeGen/AsmPrinter/DIEHashAttributes.def create mode 100644 lib/CodeGen/GlobalISel/Localizer.cpp create mode 100644 lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp delete mode 100644 lib/Target/AArch64/AArch64SchedFalkorWriteRes.td create mode 100644 lib/Target/Nios2/CMakeLists.txt create mode 100644 lib/Target/Nios2/LLVMBuild.txt create mode 100644 lib/Target/Nios2/MCTargetDesc/CMakeLists.txt create mode 100644 lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt create mode 100644 lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp create mode 100644 lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h create mode 100644 lib/Target/Nios2/Nios2.h create mode 100644 lib/Target/Nios2/Nios2.td create mode 100644 lib/Target/Nios2/Nios2InstrFormats.td create mode 100644 lib/Target/Nios2/Nios2InstrInfo.td create mode 100644 lib/Target/Nios2/Nios2RegisterInfo.td create mode 100644 lib/Target/Nios2/Nios2TargetMachine.cpp create mode 100644 lib/Target/Nios2/Nios2TargetMachine.h create mode 100644 lib/Target/Nios2/TargetInfo/CMakeLists.txt create mode 100644 lib/Target/Nios2/TargetInfo/LLVMBuild.txt create mode 100644 lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp create mode 100644 lib/Transforms/Scalar/GVNSink.cpp delete mode 100644 test/Analysis/CostModel/AArch64/falkor.ll create mode 100644 test/Analysis/Delinearization/constant_functions_multi_dim.ll create mode 100644 test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir create mode 100644 test/CodeGen/AArch64/GlobalISel/localizer.mir create mode 100644 test/CodeGen/AArch64/asm-print-comments.ll create mode 100644 test/CodeGen/AMDGPU/bfe-combine.ll create mode 100644 test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll create mode 100644 test/CodeGen/AMDGPU/shl-add-to-add-shl.ll delete mode 100644 test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir create mode 100644 test/CodeGen/Nios2/lit.local.cfg create mode 100644 test/CodeGen/Nios2/target_support.ll create mode 100644 test/CodeGen/PowerPC/atomics-constant.ll create mode 100644 test/CodeGen/PowerPC/livephysregs.mir create mode 100644 test/CodeGen/PowerPC/vec_sldwi.ll create mode 100644 test/CodeGen/Thumb/machine-cse-physreg.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-leaf-constant.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-v256.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-v512.mir create mode 100644 test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll create mode 100644 test/DebugInfo/Inputs/split-dwarf-dwp.cpp create mode 100644 test/DebugInfo/Inputs/split-dwarf-dwp.o create mode 100644 test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp create mode 100644 test/DebugInfo/MIR/X86/empty-inline.mir create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-1.yaml create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-2.yaml create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-and-types-1.yaml create mode 100644 test/DebugInfo/PDB/Inputs/merge-ids-and-types-2.yaml rename test/DebugInfo/PDB/Inputs/{merge1.yaml => merge-types-1.yaml} (100%) rename test/DebugInfo/PDB/Inputs/{merge2.yaml => 
merge-types-2.yaml} (100%) create mode 100644 test/DebugInfo/PDB/Inputs/source-names-1.yaml create mode 100644 test/DebugInfo/PDB/Inputs/source-names-2.yaml create mode 100644 test/DebugInfo/PDB/pdbdump-merge-ids-and-types.test create mode 100644 test/DebugInfo/PDB/pdbdump-mergeids.test create mode 100644 test/DebugInfo/PDB/pdbdump-objfilename.yaml create mode 100644 test/DebugInfo/PDB/pdbdump-source-names.test create mode 100644 test/DebugInfo/X86/gnu-public-names-gmlt.ll create mode 100644 test/DebugInfo/X86/split-dwarf-multiple-cu-hash.ll create mode 100644 test/DebugInfo/X86/split-dwarf-omit-empty.ll create mode 100644 test/DebugInfo/omit-empty.ll create mode 100644 test/ExecutionEngine/OrcMCJIT/pr32650.ll create mode 100644 test/Instrumentation/SanitizerCoverage/chains.ll create mode 100644 test/Instrumentation/SanitizerCoverage/postdominator_check.ll create mode 100644 test/LTO/Resolution/X86/linkonce.ll create mode 100644 test/LTO/Resolution/X86/type-checked-load.ll create mode 100644 test/MC/Disassembler/AMDGPU/sdwa_gfx9.txt create mode 100644 test/MC/WebAssembly/unnamed-data.ll create mode 100644 test/MC/X86/pr22004.s create mode 100644 test/MC/X86/x86-64-avx512vpopcntdq.s create mode 100644 test/Transforms/Coroutines/coro-debug.ll create mode 100644 test/Transforms/Coroutines/coro-materialize.ll create mode 100644 test/Transforms/EarlyCSE/const-speculation.ll create mode 100644 test/Transforms/GVN/PRE/phi-translate-2.ll create mode 100644 test/Transforms/GVNSink/dither.ll create mode 100644 test/Transforms/GVNSink/indirect-call.ll create mode 100644 test/Transforms/GVNSink/sink-common-code.ll create mode 100644 test/Transforms/GVNSink/struct.ll create mode 100644 test/Transforms/Inline/prof-update-instr.ll rename test/Transforms/Inline/{prof-update.ll => prof-update-sample.ll} (100%) delete mode 100644 test/Transforms/InstCombine/2008-07-10-ICmpBinOp.ll delete mode 100644 test/Transforms/InstCombine/2008-08-17-ICmpXorSignbit.ll create mode 100644 test/Transforms/InstCombine/icmp-xor-signbit.ll create mode 100644 test/Transforms/LoopIdiom/pr33114.ll rename test/Transforms/LoopStrengthReduce/{ => X86}/lsr-expand-quadratic.ll (51%) create mode 100644 test/Transforms/LoopStrengthReduce/nonintegral.ll create mode 100644 test/Transforms/LoopVectorize/AArch64/no_vector_instructions.ll create mode 100644 test/Transforms/LoopVectorize/SystemZ/addressing.ll create mode 100644 test/Transforms/NewGVN/pr32836.ll create mode 100644 test/tools/gold/X86/relocation-model-pic.ll create mode 100644 test/tools/llvm-nm/X86/Inputs/example.lib create mode 100644 test/tools/llvm-nm/X86/importlibrary.test create mode 100644 unittests/DebugInfo/CodeView/TypeIndexDiscoveryTest.cpp create mode 100644 unittests/IR/BasicBlockTest.cpp create mode 100644 utils/TableGen/X86FoldTablesEmitter.cpp rename utils/{abtest => }/abtest.py (81%) delete mode 100755 utils/abtest/mark_aarch64fns.py delete mode 100755 utils/abtest/mark_armfns.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 78e2e0166257..a5b96569f9c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,13 @@ if (NOT PACKAGE_VERSION) "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}${LLVM_VERSION_SUFFIX}") endif() +if ((CMAKE_GENERATOR MATCHES "Visual Studio") AND (CMAKE_GENERATOR_TOOLSET STREQUAL "")) + message(WARNING "Visual Studio generators use the x86 host compiler by " + "default, even for 64-bit targets. This can result in linker " + "instability and out of memory errors. 
To use the 64-bit " + "host compiler, pass -Thost=x64 on the CMake command line.") +endif() + project(LLVM ${cmake_3_0_PROJ_VERSION} ${cmake_3_0_LANGUAGES} diff --git a/docs/Benchmarking.rst b/docs/Benchmarking.rst new file mode 100644 index 000000000000..0f88db745a68 --- /dev/null +++ b/docs/Benchmarking.rst @@ -0,0 +1,87 @@ +================================== +Benchmarking tips +================================== + + +Introduction +============ + +For benchmarking a patch we want to reduce all possible sources of +noise as much as possible. How to do that is very OS dependent. + +Note that low noise is required, but not sufficient. It does not +exclude measurement bias. See +https://www.cis.upenn.edu/~cis501/papers/producing-wrong-data.pdf for +example. + +General +================================ + +* Use a high resolution timer, e.g. perf under linux. + +* Run the benchmark multiple times to be able to recognize noise. + +* Disable as many processes or services as possible on the target system. + +* Disable frequency scaling, turbo boost and address space + randomization (see OS specific section). + +* Static link if the OS supports it. That avoids any variation that + might be introduced by loading dynamic libraries. This can be done + by passing ``-DLLVM_BUILD_STATIC=ON`` to cmake. + +* Try to avoid storage. On some systems you can use tmpfs. Putting the + program, inputs and outputs on tmpfs avoids touching a real storage + system, which can have a pretty big variability. + + To mount it (on linux and freebsd at least):: + + mount -t tmpfs -o size=<XX>g none dir_to_mount + +Linux +===== + +* Disable address space randomization:: + + echo 0 > /proc/sys/kernel/randomize_va_space + +* Set scaling_governor to performance:: + + for i in /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor + do + echo performance > $i + done + +* Use https://github.com/lpechacek/cpuset to reserve cpus for just the + program you are benchmarking. If using perf, leave at least 2 cores + so that perf runs in one and your program in another:: + + cset shield -c N1,N2 -k on + + This will move all threads out of N1 and N2. The ``-k on`` means + that even kernel threads are moved out. + +* Disable the SMT pair of the cpus you will use for the benchmark. The + pair of cpu N can be found in + ``/sys/devices/system/cpu/cpuN/topology/thread_siblings_list`` and + disabled with:: + + echo 0 > /sys/devices/system/cpu/cpuX/online + + +* Run the program with:: + + cset shield --exec -- perf stat -r 10 <cmd> + + This will run the command after ``--`` in the isolated cpus. The + particular perf command runs the ``<cmd>`` 10 times and reports + statistics. + +With these in place you can expect perf variations of less than 0.1%. + +Linux Intel +----------- + +* Disable turbo mode:: + + echo 1 > /sys/devices/system/cpu/intel_pstate/no_turbo diff --git a/docs/GettingStartedVS.rst b/docs/GettingStartedVS.rst index 1e4676767939..50f7aa123c55 100644 --- a/docs/GettingStartedVS.rst +++ b/docs/GettingStartedVS.rst @@ -100,6 +100,10 @@ Here's the short story for getting up and running quickly with LLVM: * CMake generates project files for all build types. To select a specific build type, use the Configuration manager from the VS IDE or the ``/property:Configuration`` command line option when using MSBuild. + * By default, the Visual Studio project files generated by CMake use the + 32-bit toolset. 
If you are developing on a 64-bit version of Windows and + want to use the 64-bit toolset, pass the ``-Thost=x64`` flag when + generating the Visual Studio solution. This requires CMake 3.8.0 or later. 6. Start Visual Studio diff --git a/docs/LangRef.rst b/docs/LangRef.rst index b205cae9b118..2e339183ef11 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -4415,12 +4415,6 @@ The current supported vocabulary is limited: address space identifier. - ``DW_OP_stack_value`` marks a constant value. -DIExpression nodes that contain a ``DW_OP_stack_value`` operator are standalone -location descriptions that describe constant values. This form is used to -describe global constants that have been optimized away. All other expressions -are modifiers to another location: A debug intrinsic ties a location and a -DIExpression together. - DWARF specifies three kinds of simple location descriptions: Register, memory, and implicit location descriptions. Register and memory location descriptions describe the *location* of a source variable (in the sense that a debugger might @@ -12722,7 +12716,7 @@ Syntax: declare @llvm.experimental.constrained.fadd( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12759,7 +12753,7 @@ Syntax: declare @llvm.experimental.constrained.fsub( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12796,7 +12790,7 @@ Syntax: declare @llvm.experimental.constrained.fmul( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12833,7 +12827,7 @@ Syntax: declare @llvm.experimental.constrained.fdiv( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12870,7 +12864,7 @@ Syntax: declare @llvm.experimental.constrained.frem( , , metadata , - metadata ) + metadata ) Overview: """"""""" @@ -12899,6 +12893,461 @@ value operands and has the same type as the operands. The remainder has the same sign as the dividend. +Constrained libm-equivalent Intrinsics +-------------------------------------- + +In addition to the basic floating point operations for which constrained +intrinsics are described above, there are constrained versions of various +operations which provide equivalent behavior to a corresponding libm function. +These intrinsics allow the precise behavior of these operations with respect to +rounding mode and exception behavior to be controlled. + +As with the basic constrained floating point intrinsics, the rounding mode +and exception behavior arguments only control the behavior of the optimizer. +They do not change the runtime floating point environment. + + +'``llvm.experimental.constrained.sqrt``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sqrt( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sqrt``' intrinsic returns the square root +of the specified value, returning the same value as the libm '``sqrt``' +functions would, but without setting ``errno``. + +Arguments: +"""""""""" + +The first argument and the return type are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the nonnegative square root of the specified value. +If the value is less than negative zero, a floating point exception occurs +and the the return value is architecture specific. 
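As an illustrative sketch only (the function name here is hypothetical; the metadata strings are those defined for the basic constrained intrinsics earlier in this section), a call to the f64 overload of the constrained ``sqrt`` intrinsic could look like this::

    ; Hypothetical caller: square root under dynamic rounding with strict
    ; floating point exception semantics.
    define double @sample_constrained_sqrt(double %x) {
    entry:
      %r = call double @llvm.experimental.constrained.sqrt.f64(
                           double %x,
                           metadata !"round.dynamic",
                           metadata !"fpexcept.strict")
      ret double %r
    }

    ; Overloaded intrinsics are declared with the mangled type suffix.
    declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)

As with the basic constrained intrinsics, the metadata arguments only constrain the optimizer; the actual rounding mode and exception state still come from the runtime floating point environment.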
+ + +'``llvm.experimental.constrained.pow``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.pow( , , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.pow``' intrinsic returns the first operand +raised to the (positive or negative) power specified by the second operand. + +Arguments: +"""""""""" + +The first two arguments and the return value are floating point numbers of the +same type. The second argument specifies the power to which the first argument +should be raised. + +The third and fourth arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the first value raised to the second power, +returning the same values as the libm ``pow`` functions would, and +handles error conditions in the same way. + + +'``llvm.experimental.constrained.powi``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.powi( , i32 , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.powi``' intrinsic returns the first operand +raised to the (positive or negative) power specified by the second operand. The +order of evaluation of multiplications is not defined. When a vector of floating +point type is used, the second argument remains a scalar integer value. + + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. The second argument is a 32-bit signed integer specifying the power to +which the first argument should be raised. + +The third and fourth arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the first value raised to the second power with an +unspecified sequence of rounding operations. + + +'``llvm.experimental.constrained.sin``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.sin( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.sin``' intrinsic returns the sine of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return type are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the sine of the specified operand, returning the +same values as the libm ``sin`` functions would, and handles error +conditions in the same way. + + +'``llvm.experimental.constrained.cos``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.cos( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.cos``' intrinsic returns the cosine of the +first operand. + +Arguments: +"""""""""" + +The first argument and the return type are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the cosine of the specified operand, returning the +same values as the libm ``cos`` functions would, and handles error +conditions in the same way. 
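The ``powi`` description above makes one point that is easy to miss: even when the value operand and result are vectors, the exponent stays a scalar ``i32``. A hedged sketch (hypothetical function name, metadata strings as defined for the basic constrained intrinsics) of the ``<4 x float>`` overload::

    ; Hypothetical caller: vector powi; the exponent is a scalar i32 even
    ; though the value operand and result are <4 x float>.
    define <4 x float> @sample_constrained_powi(<4 x float> %v) {
    entry:
      %r = call <4 x float> @llvm.experimental.constrained.powi.v4f32(
                                <4 x float> %v, i32 3,
                                metadata !"round.dynamic",
                                metadata !"fpexcept.strict")
      ret <4 x float> %r
    }

    declare <4 x float> @llvm.experimental.constrained.powi.v4f32(<4 x float>, i32, metadata, metadata)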
+ + +'``llvm.experimental.constrained.exp``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.exp( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.exp``' intrinsic computes the base-e +exponential of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``exp`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.exp2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.exp2( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.exp2``' intrinsic computes the base-2 +exponential of the specified value. + + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``exp2`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.log``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.log( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.log``' intrinsic computes the base-e +logarithm of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + + +Semantics: +"""""""""" + +This function returns the same values as the libm ``log`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.log10``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.log10( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.log10``' intrinsic computes the base-10 +logarithm of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``log10`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.log2``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.log2( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.log2``' intrinsic computes the base-2 +logarithm of the specified value. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. 
+ +Semantics: +"""""""""" + +This function returns the same values as the libm ``log2`` functions +would, and handles error conditions in the same way. + + +'``llvm.experimental.constrained.rint``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.rint( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.rint``' intrinsic returns the first +operand rounded to the nearest integer. It may raise an inexact floating point +exception if the operand is not an integer. + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``rint`` functions +would, and handles error conditions in the same way. The rounding mode is +described, not determined, by the rounding mode argument. The actual rounding +mode is determined by the runtime floating point environment. The rounding +mode argument is only intended as information to the compiler. + + +'``llvm.experimental.constrained.nearbyint``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +:: + + declare + @llvm.experimental.constrained.nearbyint( , + metadata , + metadata ) + +Overview: +""""""""" + +The '``llvm.experimental.constrained.nearbyint``' intrinsic returns the first +operand rounded to the nearest integer. It will not raise an inexact floating +point exception if the operand is not an integer. + + +Arguments: +"""""""""" + +The first argument and the return value are floating point numbers of the same +type. + +The second and third arguments specify the rounding mode and exception +behavior as described above. + +Semantics: +"""""""""" + +This function returns the same values as the libm ``nearbyint`` functions +would, and handles error conditions in the same way. The rounding mode is +described, not determined, by the rounding mode argument. The actual rounding +mode is determined by the runtime floating point environment. The rounding +mode argument is only intended as information to the compiler. + + General Intrinsics ------------------ diff --git a/docs/Vectorizers.rst b/docs/Vectorizers.rst index 65c19aa2bc0c..a909d458c317 100644 --- a/docs/Vectorizers.rst +++ b/docs/Vectorizers.rst @@ -99,7 +99,9 @@ Optimization remarks are enabled using: indicates if vectorization was specified. ``-Rpass-analysis=loop-vectorize`` identifies the statements that caused -vectorization to fail. +vectorization to fail. If in addition ``-fsave-optimization-record`` is +provided, multiple causes of vectorization failure may be listed (this behavior +might change in the future). Consider the following loop: diff --git a/docs/index.rst b/docs/index.rst index fe47eb1bcb7f..becbe48e7ec7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -90,6 +90,7 @@ representation. CodeOfConduct CompileCudaWithLLVM ReportingGuide + Benchmarking :doc:`GettingStarted` Discusses how to get up and running quickly with the LLVM infrastructure. 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h index a14fd1dc20ec..847662cc11be 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,7 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -44,7 +44,7 @@ private: IRCompileLayer CompileLayer; public: - typedef decltype(CompileLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(CompileLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h index 2039be4571a5..a5ac2f017b74 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,7 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -47,13 +47,13 @@ private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; public: - typedef decltype(OptimizeLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(OptimizeLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h index d22d41855072..7acb9c748880 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" @@ -49,8 +50,8 @@ 
private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; @@ -58,7 +59,7 @@ private: CompileOnDemandLayer CODLayer; public: - typedef decltype(CODLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(CODLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h index e0a78410f713..03e42230ae71 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,10 +17,10 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" -#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" @@ -76,8 +76,8 @@ private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; @@ -85,7 +85,7 @@ private: std::unique_ptr IndirectStubsMgr; public: - typedef decltype(OptimizeLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(OptimizeLayer)::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), @@ -106,7 +106,6 @@ public: TargetMachine &getTargetMachine() { return *TM; } ModuleHandle addModule(std::unique_ptr M) { - // Build our symbol resolver: // Lambda 1: Look back into the JIT itself to find symbols that are part of // the same "logical dylib". 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h index 70a896fe8f00..0ee9d094ab82 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -20,9 +20,8 @@ #include "llvm/ADT/Triple.h" #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" -#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" +#include "llvm/ExecutionEngine/Orc/IndirectionUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" #include "llvm/ExecutionEngine/Orc/LambdaResolver.h" @@ -73,7 +72,7 @@ namespace llvm { namespace orc { // Typedef the remote-client API. -typedef remote::OrcRemoteTargetClient MyRemote; +using MyRemote = remote::OrcRemoteTargetClient; class KaleidoscopeJIT { private: @@ -82,8 +81,8 @@ private: RTDyldObjectLinkingLayer<> ObjectLayer; IRCompileLayer CompileLayer; - typedef std::function(std::unique_ptr)> - OptimizeFunction; + using OptimizeFunction = + std::function(std::unique_ptr)>; IRTransformLayer OptimizeLayer; @@ -92,7 +91,7 @@ private: MyRemote &Remote; public: - typedef decltype(OptimizeLayer)::ModuleSetHandleT ModuleHandle; + using ModuleHandle = decltype(OptimizeLayer)::ModuleSetHandleT; KaleidoscopeJIT(MyRemote &Remote) : TM(EngineBuilder().selectTarget(Triple(Remote.getTargetTriple()), "", @@ -124,7 +123,6 @@ public: TargetMachine &getTargetMachine() { return *TM; } ModuleHandle addModule(std::unique_ptr M) { - // Build our symbol resolver: // Lambda 1: Look back into the JIT itself to find symbols that are part of // the same "logical dylib". 
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp b/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp index da6e8ac65234..e50a7ecf96bc 100644 --- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp +++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/Server/server.cpp @@ -1,17 +1,19 @@ -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/DynamicLibrary.h" -#include "llvm/Support/TargetSelect.h" +#include "../RemoteJITUtils.h" +#include "llvm/ExecutionEngine/RTDyldMemoryManager.h" #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h" #include "llvm/ExecutionEngine/Orc/OrcABISupport.h" - -#include "../RemoteJITUtils.h" - +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetSelect.h" +#include +#include #include -#include +#include #include #include - using namespace llvm; using namespace llvm::orc; @@ -22,7 +24,7 @@ cl::opt Port("port", ExitOnError ExitOnErr; -typedef int (*MainFun)(int, const char*[]); +using MainFun = int (*)(int, const char*[]); template NativePtrT MakeNative(uint64_t P) { @@ -36,7 +38,6 @@ void printExprResult(double Val) { // --- LAZY COMPILE TEST --- int main(int argc, char* argv[]) { - if (argc == 0) ExitOnErr.setBanner("jit_server: "); else @@ -59,14 +60,14 @@ int main(int argc, char* argv[]) { int sockfd = socket(PF_INET, SOCK_STREAM, 0); sockaddr_in servAddr, clientAddr; socklen_t clientAddrLen = sizeof(clientAddr); - bzero(&servAddr, sizeof(servAddr)); + memset(&servAddr, 0, sizeof(servAddr)); servAddr.sin_family = PF_INET; servAddr.sin_family = INADDR_ANY; servAddr.sin_port = htons(Port); { // avoid "Address already in use" error. - int yes=1; + int yes = 1; if (setsockopt(sockfd,SOL_SOCKET,SO_REUSEADDR,&yes,sizeof(int)) == -1) { errs() << "Error calling setsockopt.\n"; return 1; @@ -98,7 +99,8 @@ int main(int argc, char* argv[]) { }; FDRPCChannel TCPChannel(newsockfd, newsockfd); - typedef remote::OrcRemoteTargetServer MyServerT; + + using MyServerT = remote::OrcRemoteTargetServer; MyServerT Server(TCPChannel, SymbolLookup, RegisterEHFrames, DeregisterEHFrames); diff --git a/examples/Kaleidoscope/include/KaleidoscopeJIT.h b/examples/Kaleidoscope/include/KaleidoscopeJIT.h index 1dca39deba3c..9a682f7ab744 100644 --- a/examples/Kaleidoscope/include/KaleidoscopeJIT.h +++ b/examples/Kaleidoscope/include/KaleidoscopeJIT.h @@ -1,4 +1,4 @@ -//===----- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope ----*- C++ -*-===// +//===- KaleidoscopeJIT.h - A simple JIT for Kaleidoscope --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,7 +19,6 @@ #include "llvm/ExecutionEngine/ExecutionEngine.h" #include "llvm/ExecutionEngine/JITSymbol.h" #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" -#include "llvm/ExecutionEngine/RuntimeDyld.h" #include "llvm/ExecutionEngine/SectionMemoryManager.h" #include "llvm/ExecutionEngine/Orc/CompileUtils.h" #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" @@ -40,9 +39,9 @@ namespace orc { class KaleidoscopeJIT { public: - typedef RTDyldObjectLinkingLayer<> ObjLayerT; - typedef IRCompileLayer CompileLayerT; - typedef CompileLayerT::ModuleSetHandleT ModuleHandleT; + using ObjLayerT = RTDyldObjectLinkingLayer<>; + using CompileLayerT = IRCompileLayer; + using ModuleHandleT = CompileLayerT::ModuleSetHandleT; KaleidoscopeJIT() : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()), diff --git a/include/llvm/ADT/Triple.h 
b/include/llvm/ADT/Triple.h index 3a4a37017d61..07626982d289 100644 --- a/include/llvm/ADT/Triple.h +++ b/include/llvm/ADT/Triple.h @@ -59,6 +59,7 @@ public: mips64, // MIPS64: mips64 mips64el, // MIPS64EL: mips64el msp430, // MSP430: msp430 + nios2, // NIOSII: nios2 ppc, // PPC: powerpc ppc64, // PPC64: powerpc64, ppu ppc64le, // PPC64LE: powerpc64le diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h index bf73e099a2bf..ca48b5483512 100644 --- a/include/llvm/Analysis/InstructionSimplify.h +++ b/include/llvm/Analysis/InstructionSimplify.h @@ -70,174 +70,173 @@ struct SimplifyQuery { Copy.CxtI = I; return Copy; } - }; +}; - // NOTE: the explicit multiple argument versions of these functions are - // deprecated. - // Please use the SimplifyQuery versions in new code. +// NOTE: the explicit multiple argument versions of these functions are +// deprecated. +// Please use the SimplifyQuery versions in new code. - /// Given operands for an Add, fold the result or return null. - Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, +/// Given operands for an Add, fold the result or return null. +Value *SimplifyAddInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, const SimplifyQuery &Q); - /// Given operands for a Sub, fold the result or return null. - Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, - const SimplifyQuery &Q); +/// Given operands for a Sub, fold the result or return null. +Value *SimplifySubInst(Value *LHS, Value *RHS, bool isNSW, bool isNUW, + const SimplifyQuery &Q); - /// Given operands for an FAdd, fold the result or return null. - Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, +/// Given operands for an FAdd, fold the result or return null. +Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for an FSub, fold the result or return null. +Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for an FMul, fold the result or return null. +Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for a Mul, fold the result or return null. +Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an SDiv, fold the result or return null. +Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for a UDiv, fold the result or return null. +Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an FDiv, fold the result or return null. +Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for an SRem, fold the result or return null. +Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for a URem, fold the result or return null. +Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an FRem, fold the result or return null. +Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const SimplifyQuery &Q); + +/// Given operands for a Shl, fold the result or return null. +Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, + const SimplifyQuery &Q); + +/// Given operands for a LShr, fold the result or return null. 
+Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q); + +/// Given operands for a AShr, fold the result or return nulll. +Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, + const SimplifyQuery &Q); + +/// Given operands for an And, fold the result or return null. +Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an Or, fold the result or return null. +Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an Xor, fold the result or return null. +Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); + +/// Given operands for an ICmpInst, fold the result or return null. +Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, + const SimplifyQuery &Q); + +/// Given operands for an FCmpInst, fold the result or return null. +Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q); + +/// Given operands for a SelectInst, fold the result or return null. +Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, const SimplifyQuery &Q); - /// Given operands for an FSub, fold the result or return null. - Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); +/// Given operands for a GetElementPtrInst, fold the result or return null. +Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, + const SimplifyQuery &Q); - /// Given operands for an FMul, fold the result or return null. - Value *SimplifyFMulInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); +/// Given operands for an InsertValueInst, fold the result or return null. +Value *SimplifyInsertValueInst(Value *Agg, Value *Val, ArrayRef Idxs, + const SimplifyQuery &Q); - /// Given operands for a Mul, fold the result or return null. - Value *SimplifyMulInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); +/// Given operands for an ExtractValueInst, fold the result or return null. +Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, + const SimplifyQuery &Q); - /// Given operands for an SDiv, fold the result or return null. - Value *SimplifySDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for a UDiv, fold the result or return null. - Value *SimplifyUDivInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an FDiv, fold the result or return null. - Value *SimplifyFDivInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); - - /// Given operands for an SRem, fold the result or return null. - Value *SimplifySRemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for a URem, fold the result or return null. - Value *SimplifyURemInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an FRem, fold the result or return null. - Value *SimplifyFRemInst(Value *LHS, Value *RHS, FastMathFlags FMF, - const SimplifyQuery &Q); - - /// Given operands for a Shl, fold the result or return null. - Value *SimplifyShlInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, - const SimplifyQuery &Q); - - /// Given operands for a LShr, fold the result or return null. - Value *SimplifyLShrInst(Value *Op0, Value *Op1, bool isExact, - const SimplifyQuery &Q); - - /// Given operands for a AShr, fold the result or return nulll. 
- Value *SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, - const SimplifyQuery &Q); - - /// Given operands for an And, fold the result or return null. - Value *SimplifyAndInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an Or, fold the result or return null. - Value *SimplifyOrInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an Xor, fold the result or return null. - Value *SimplifyXorInst(Value *LHS, Value *RHS, const SimplifyQuery &Q); - - /// Given operands for an ICmpInst, fold the result or return null. - Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const SimplifyQuery &Q); - - /// Given operands for an FCmpInst, fold the result or return null. - Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - FastMathFlags FMF, const SimplifyQuery &Q); - - /// Given operands for a SelectInst, fold the result or return null. - Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, - const SimplifyQuery &Q); - - /// Given operands for a GetElementPtrInst, fold the result or return null. - Value *SimplifyGEPInst(Type *SrcTy, ArrayRef Ops, - const SimplifyQuery &Q); - - /// Given operands for an InsertValueInst, fold the result or return null. - Value *SimplifyInsertValueInst(Value *Agg, Value *Val, - ArrayRef Idxs, - const SimplifyQuery &Q); - - /// Given operands for an ExtractValueInst, fold the result or return null. - Value *SimplifyExtractValueInst(Value *Agg, ArrayRef Idxs, +/// Given operands for an ExtractElementInst, fold the result or return null. +Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &Q); - /// Given operands for an ExtractElementInst, fold the result or return null. - Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, - const SimplifyQuery &Q); +/// Given operands for a CastInst, fold the result or return null. +Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, + const SimplifyQuery &Q); - /// Given operands for a CastInst, fold the result or return null. - Value *SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, - const SimplifyQuery &Q); +/// Given operands for a ShuffleVectorInst, fold the result or return null. +Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, + Type *RetTy, const SimplifyQuery &Q); - /// Given operands for a ShuffleVectorInst, fold the result or return null. - Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, - Type *RetTy, const SimplifyQuery &Q); +//=== Helper functions for higher up the class hierarchy. - //=== Helper functions for higher up the class hierarchy. - - - /// Given operands for a CmpInst, fold the result or return null. - Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, - const SimplifyQuery &Q); - - /// Given operands for a BinaryOperator, fold the result or return null. - Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, +/// Given operands for a CmpInst, fold the result or return null. +Value *SimplifyCmpInst(unsigned Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q); - /// Given operands for an FP BinaryOperator, fold the result or return null. - /// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the - /// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. 
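The block above replaces the old multi-argument Simplify* entry points with overloads that take a single SimplifyQuery. A hypothetical helper showing the intended calling pattern under the new signatures (the helper name is made up; inside a pass the query would normally come from getBestSimplifyQuery()):

    #include "llvm/Analysis/InstructionSimplify.h"
    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    // Try to fold an add with the query-based API; only materialize an
    // instruction when SimplifyAddInst cannot fold it.
    static Value *foldOrCreateAdd(IRBuilder<> &B, Value *L, Value *R,
                                  const SimplifyQuery &Q) {
      if (Value *V = SimplifyAddInst(L, R, /*isNSW=*/false, /*isNUW=*/false, Q))
        return V;                 // folded to an existing value or constant
      return B.CreateAdd(L, R);   // otherwise emit the instruction
    }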
- Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, - FastMathFlags FMF, const SimplifyQuery &Q); +/// Given operands for a BinaryOperator, fold the result or return null. +Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, + const SimplifyQuery &Q); - /// Given a function and iterators over arguments, fold the result or return - /// null. - Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, - User::op_iterator ArgEnd, const SimplifyQuery &Q); +/// Given operands for an FP BinaryOperator, fold the result or return null. +/// In contrast to SimplifyBinOp, try to use FastMathFlag when folding the +/// result. In case we don't need FastMathFlags, simply fall to SimplifyBinOp. +Value *SimplifyFPBinOp(unsigned Opcode, Value *LHS, Value *RHS, + FastMathFlags FMF, const SimplifyQuery &Q); - /// Given a function and set of arguments, fold the result or return null. - Value *SimplifyCall(Value *V, ArrayRef Args, const SimplifyQuery &Q); +/// Given a function and iterators over arguments, fold the result or return +/// null. +Value *SimplifyCall(Value *V, User::op_iterator ArgBegin, + User::op_iterator ArgEnd, const SimplifyQuery &Q); - /// See if we can compute a simplified version of this instruction. If not, - /// return null. - Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, - OptimizationRemarkEmitter *ORE = nullptr); +/// Given a function and set of arguments, fold the result or return null. +Value *SimplifyCall(Value *V, ArrayRef Args, const SimplifyQuery &Q); - /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. - /// - /// This first performs a normal RAUW of I with SimpleV. It then recursively - /// attempts to simplify those users updated by the operation. The 'I' - /// instruction must not be equal to the simplified value 'SimpleV'. - /// - /// The function returns true if any simplifications were performed. - bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); +/// See if we can compute a simplified version of this instruction. If not, +/// return null. +Value *SimplifyInstruction(Instruction *I, const SimplifyQuery &Q, + OptimizationRemarkEmitter *ORE = nullptr); - /// Recursively attempt to simplify an instruction. - /// - /// This routine uses SimplifyInstruction to simplify 'I', and if successful - /// replaces uses of 'I' with the simplified value. It then recurses on each - /// of the users impacted. It returns true if any simplifications were - /// performed. - bool recursivelySimplifyInstruction(Instruction *I, - const TargetLibraryInfo *TLI = nullptr, - const DominatorTree *DT = nullptr, - AssumptionCache *AC = nullptr); - // These helper functions return a SimplifyQuery structure that contains as - // many of the optional analysis we use as are currently valid. This is the - // strongly preferred way of constructing SimplifyQuery in passes. - const SimplifyQuery getBestSimplifyQuery(Pass &, Function &); - template - const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, - Function &); - const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &, - const DataLayout &); +/// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively. +/// +/// This first performs a normal RAUW of I with SimpleV. It then recursively +/// attempts to simplify those users updated by the operation. 
The 'I' +/// instruction must not be equal to the simplified value 'SimpleV'. +/// +/// The function returns true if any simplifications were performed. +bool replaceAndRecursivelySimplify(Instruction *I, Value *SimpleV, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr, + AssumptionCache *AC = nullptr); + +/// Recursively attempt to simplify an instruction. +/// +/// This routine uses SimplifyInstruction to simplify 'I', and if successful +/// replaces uses of 'I' with the simplified value. It then recurses on each +/// of the users impacted. It returns true if any simplifications were +/// performed. +bool recursivelySimplifyInstruction(Instruction *I, + const TargetLibraryInfo *TLI = nullptr, + const DominatorTree *DT = nullptr, + AssumptionCache *AC = nullptr); + +// These helper functions return a SimplifyQuery structure that contains as +// many of the optional analysis we use as are currently valid. This is the +// strongly preferred way of constructing SimplifyQuery in passes. +const SimplifyQuery getBestSimplifyQuery(Pass &, Function &); +template +const SimplifyQuery getBestSimplifyQuery(AnalysisManager &, + Function &); +const SimplifyQuery getBestSimplifyQuery(LoopStandardAnalysisResults &, + const DataLayout &); } // end namespace llvm #endif diff --git a/include/llvm/Analysis/LoopPass.h b/include/llvm/Analysis/LoopPass.h index 496ae189e57b..75e7688bbdc2 100644 --- a/include/llvm/Analysis/LoopPass.h +++ b/include/llvm/Analysis/LoopPass.h @@ -126,9 +126,8 @@ public: } public: - // Add a new loop into the loop queue as a child of the given parent, or at - // the top level if \c ParentLoop is null. - Loop &addLoop(Loop *ParentLoop); + // Add a new loop into the loop queue. + void addLoop(Loop &L); //===--------------------------------------------------------------------===// /// SimpleAnalysis - Provides simple interface to update analysis info diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index ac54bd4cfffb..4a6fc245c225 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -1533,6 +1533,12 @@ public: /// specified loop. bool isLoopInvariant(const SCEV *S, const Loop *L); + /// Determine if the SCEV can be evaluated at loop's entry. It is true if it + /// doesn't depend on a SCEVUnknown of an instruction which is dominated by + /// the header of loop L. + bool isAvailableAtLoopEntry(const SCEV *S, const Loop *L, DominatorTree &DT, + LoopInfo &LI); + /// Return true if the given SCEV changes value in a known way in the /// specified loop. This property being true implies that the value is /// variant in the loop AND that we can emit an expression to compute the diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index 0a0af384c3e6..6cbe3a1f515e 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -396,6 +396,9 @@ public: bool isLegalMaskedScatter(Type *DataType) const; bool isLegalMaskedGather(Type *DataType) const; + /// Return true if target doesn't mind addresses in vectors. + bool prefersVectorizedAddressing() const; + /// \brief Return the cost of the scaling factor used in the addressing /// mode represented by AM for this target, for a load/store /// of the specified type. 
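The prefersVectorizedAddressing() query added above is threaded through TargetTransformInfo in the usual way: a pure-virtual hook in the internal Concept and a forwarding override in the Model follow in the next hunk, with a default of true in TargetTransformInfoImpl.h. A stand-alone miniature of that type-erasure shape, with invented names:

    #include <memory>
    #include <utility>

    struct Concept {
      virtual ~Concept() = default;
      virtual bool prefersVectorizedAddressing() = 0;
    };

    template <typename ImplT> struct Model final : Concept {
      ImplT Impl;
      explicit Model(ImplT I) : Impl(std::move(I)) {}
      bool prefersVectorizedAddressing() override {
        return Impl.prefersVectorizedAddressing();
      }
    };

    // Mirrors the default added to TargetTransformInfoImpl.h.
    struct DefaultImpl {
      bool prefersVectorizedAddressing() { return true; }
    };

    class MiniTTI {
      std::unique_ptr<Concept> C;
    public:
      template <typename ImplT>
      explicit MiniTTI(ImplT Impl) : C(new Model<ImplT>(std::move(Impl))) {}
      bool prefersVectorizedAddressing() const {
        return C->prefersVectorizedAddressing();
      }
    };

    int main() {
      return MiniTTI(DefaultImpl()).prefersVectorizedAddressing() ? 0 : 1;
    }

A target that wants addresses kept scalar supplies an implementation whose hook returns false; every other target inherits the default.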
@@ -807,6 +810,7 @@ public: virtual bool isLegalMaskedLoad(Type *DataType) = 0; virtual bool isLegalMaskedScatter(Type *DataType) = 0; virtual bool isLegalMaskedGather(Type *DataType) = 0; + virtual bool prefersVectorizedAddressing() = 0; virtual int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) = 0; @@ -1000,6 +1004,9 @@ public: bool isLegalMaskedGather(Type *DataType) override { return Impl.isLegalMaskedGather(DataType); } + bool prefersVectorizedAddressing() override { + return Impl.prefersVectorizedAddressing(); + } int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) override { diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index 550e84ad90c4..ad1a7cb748fe 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -237,6 +237,8 @@ public: bool isLegalMaskedGather(Type *DataType) { return false; } + bool prefersVectorizedAddressing() { return true; } + int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { // Guess that all legal addressing mode are free. diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index cf24062e46f8..b1ee76159c4b 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -60,7 +60,8 @@ template class ArrayRef; KnownBits computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth = 0, AssumptionCache *AC = nullptr, const Instruction *CxtI = nullptr, - const DominatorTree *DT = nullptr); + const DominatorTree *DT = nullptr, + OptimizationRemarkEmitter *ORE = nullptr); /// Compute known bits from the range metadata. /// \p KnownZero the set of bits that are known to be zero /// \p KnownOne the set of bits that are known to be one diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index 180c0b579248..c898667f1474 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -34,6 +34,7 @@ namespace llvm { class AsmPrinterHandler; +class BasicBlock; class BlockAddress; class Constant; class ConstantArray; @@ -43,6 +44,7 @@ class DIEAbbrev; class DwarfDebug; class GCMetadataPrinter; class GlobalIndirectSymbol; +class GlobalObject; class GlobalValue; class GlobalVariable; class GCStrategy; @@ -65,6 +67,8 @@ class MCSubtargetInfo; class MCSymbol; class MCTargetOptions; class MDNode; +class Module; +class raw_ostream; class TargetLoweringObjectFile; class TargetMachine; @@ -109,7 +113,7 @@ public: /// Map global GOT equivalent MCSymbols to GlobalVariables and keep track of /// its number of uses by other globals. 
- typedef std::pair GOTEquivUsePair; + using GOTEquivUsePair = std::pair; MapVector GlobalGOTEquivs; /// Enable print [latency:throughput] in output diff --git a/include/llvm/CodeGen/AtomicExpandUtils.h b/include/llvm/CodeGen/AtomicExpandUtils.h index ac18eac8a1ce..1f9c96b18e1b 100644 --- a/include/llvm/CodeGen/AtomicExpandUtils.h +++ b/include/llvm/CodeGen/AtomicExpandUtils.h @@ -1,4 +1,4 @@ -//===-- AtomicExpandUtils.h - Utilities for expanding atomic instructions -===// +//===- AtomicExpandUtils.h - Utilities for expanding atomic instructions --===// // // The LLVM Compiler Infrastructure // @@ -7,19 +7,24 @@ // //===----------------------------------------------------------------------===// +#ifndef LLVM_CODEGEN_ATOMICEXPANDUTILS_H +#define LLVM_CODEGEN_ATOMICEXPANDUTILS_H + #include "llvm/ADT/STLExtras.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/Support/AtomicOrdering.h" namespace llvm { -class Value; -class AtomicRMWInst; +class AtomicRMWInst; +class Value; /// Parameters (see the expansion example below): /// (the builder, %addr, %loaded, %new_val, ordering, /// /* OUT */ %success, /* OUT */ %new_loaded) -typedef function_ref &, Value *, Value *, Value *, - AtomicOrdering, Value *&, Value *&)> CreateCmpXchgInstFun; +using CreateCmpXchgInstFun = + function_ref &, Value *, Value *, Value *, AtomicOrdering, + Value *&, Value *&)>; /// \brief Expand an atomic RMW instruction into a loop utilizing /// cmpxchg. You'll want to make sure your target machine likes cmpxchg @@ -42,7 +47,8 @@ typedef function_ref &, Value *, Value *, Value *, /// loop: /// %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ] /// %new = some_op iN %loaded, %incr -/// ; This is what -atomic-expand will produce using this function on i686 targets: +/// ; This is what -atomic-expand will produce using this function on i686 +/// targets: /// %pair = cmpxchg iN* %addr, iN %loaded, iN %new_val /// %new_loaded = extractvalue { iN, i1 } %pair, 0 /// %success = extractvalue { iN, i1 } %pair, 1 @@ -52,6 +58,8 @@ typedef function_ref &, Value *, Value *, Value *, /// [...] /// /// Returns true if the containing function was modified. -bool -expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun Factory); -} +bool expandAtomicRMWToCmpXchg(AtomicRMWInst *AI, CreateCmpXchgInstFun Factory); + +} // end namespace llvm + +#endif // LLVM_CODEGEN_ATOMICEXPANDUTILS_H diff --git a/include/llvm/CodeGen/DIE.h b/include/llvm/CodeGen/DIE.h index a40147336fe2..4be44e62fa92 100644 --- a/include/llvm/CodeGen/DIE.h +++ b/include/llvm/CodeGen/DIE.h @@ -1,4 +1,4 @@ -//===--- lib/CodeGen/DIE.h - DWARF Info Entries -----------------*- C++ -*-===// +//===- lib/CodeGen/DIE.h - DWARF Info Entries -------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -31,6 +31,7 @@ #include #include #include +#include #include namespace llvm { @@ -53,11 +54,11 @@ class DIEAbbrevData { dwarf::Form Form; /// Dwarf attribute value for DW_FORM_implicit_const - int64_t Value; + int64_t Value = 0; public: DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) - : Attribute(A), Form(F), Value(0) {} + : Attribute(A), Form(F) {} DIEAbbrevData(dwarf::Attribute A, int64_t V) : Attribute(A), Form(dwarf::DW_FORM_implicit_const), Value(V) {} @@ -136,13 +137,14 @@ class DIEAbbrevSet { /// storage container. BumpPtrAllocator &Alloc; /// \brief FoldingSet that uniques the abbreviations. - llvm::FoldingSet AbbreviationsSet; + FoldingSet AbbreviationsSet; /// A list of all the unique abbreviations in use. 
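The AtomicExpandUtils.h hunk above cleans up the CreateCmpXchgInstFun callback type consumed by expandAtomicRMWToCmpXchg(). A sketch of such a callback, roughly the shape used by the in-tree AtomicExpand pass (the function name here is illustrative): it emits the cmpxchg and hands the loaded value and the success flag back to the expansion loop shown in the comment.

    #include "llvm/CodeGen/AtomicExpandUtils.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    static void createCmpXchgSketch(IRBuilder<> &Builder, Value *Addr,
                                    Value *Loaded, Value *NewVal,
                                    AtomicOrdering MemOpOrder, Value *&Success,
                                    Value *&NewLoaded) {
      Value *Pair = Builder.CreateAtomicCmpXchg(
          Addr, Loaded, NewVal, MemOpOrder,
          AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
      NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
      Success = Builder.CreateExtractValue(Pair, 1, "success");
    }

A target pass would then call expandAtomicRMWToCmpXchg(AI, createCmpXchgSketch) for each atomicrmw it wants lowered this way.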
std::vector Abbreviations; public: DIEAbbrevSet(BumpPtrAllocator &A) : Alloc(A) {} ~DIEAbbrevSet(); + /// Generate the abbreviation declaration for a DIE and return a pointer to /// the generated abbreviation. /// @@ -289,13 +291,11 @@ public: /// A pointer to another debug information entry. An instance of this class can /// also be used as a proxy for a debug information entry not yet defined /// (ie. types.) -class DIE; class DIEEntry { DIE *Entry; - DIEEntry() = delete; - public: + DIEEntry() = delete; explicit DIEEntry(DIE &E) : Entry(&E) {} DIE &getEntry() const { return *Entry; } @@ -348,10 +348,10 @@ private: /// /// All values that aren't standard layout (or are larger than 8 bytes) /// should be stored by reference instead of by value. - typedef AlignedCharArrayUnion - ValTy; + using ValTy = AlignedCharArrayUnion; + static_assert(sizeof(ValTy) <= sizeof(uint64_t) || sizeof(ValTy) <= sizeof(void *), "Expected all large types to be stored via pointer"); @@ -486,10 +486,12 @@ struct IntrusiveBackListNode { }; struct IntrusiveBackListBase { - typedef IntrusiveBackListNode Node; + using Node = IntrusiveBackListNode; + Node *Last = nullptr; bool empty() const { return !Last; } + void push_back(Node &N) { assert(N.Next.getPointer() == &N && "Expected unlinked node"); assert(N.Next.getInt() == true && "Expected unlinked node"); @@ -505,6 +507,7 @@ struct IntrusiveBackListBase { template class IntrusiveBackList : IntrusiveBackListBase { public: using IntrusiveBackListBase::empty; + void push_back(T &N) { IntrusiveBackListBase::push_back(N); } T &back() { return *static_cast(Last); } const T &back() const { return *static_cast(Last); } @@ -513,6 +516,7 @@ public: class iterator : public iterator_facade_base { friend class const_iterator; + Node *N = nullptr; public: @@ -585,10 +589,12 @@ public: class DIEValueList { struct Node : IntrusiveBackListNode { DIEValue V; + explicit Node(DIEValue V) : V(V) {} }; - typedef IntrusiveBackList ListTy; + using ListTy = IntrusiveBackList; + ListTy List; public: @@ -597,9 +603,10 @@ public: : public iterator_adaptor_base { friend class const_value_iterator; - typedef iterator_adaptor_base iterator_adaptor; + + using iterator_adaptor = + iterator_adaptor_base; public: value_iterator() = default; @@ -612,9 +619,9 @@ public: class const_value_iterator : public iterator_adaptor_base< const_value_iterator, ListTy::const_iterator, std::forward_iterator_tag, const DIEValue> { - typedef iterator_adaptor_base iterator_adaptor; + using iterator_adaptor = + iterator_adaptor_base; public: const_value_iterator() = default; @@ -627,8 +634,8 @@ public: const DIEValue &operator*() const { return wrapped()->V; } }; - typedef iterator_range value_range; - typedef iterator_range const_value_range; + using value_range = iterator_range; + using const_value_range = iterator_range; value_iterator addValue(BumpPtrAllocator &Alloc, const DIEValue &V) { List.push_back(*new (Alloc) Node(V)); @@ -657,15 +664,15 @@ class DIE : IntrusiveBackListNode, public DIEValueList { friend class DIEUnit; /// Dwarf unit relative offset. - unsigned Offset; + unsigned Offset = 0; /// Size of instance + children. - unsigned Size; + unsigned Size = 0; unsigned AbbrevNumber = ~0u; /// Dwarf tag code. dwarf::Tag Tag = (dwarf::Tag)0; /// Set to true to force a DIE to emit an abbreviation that says it has /// children even when it doesn't. This is used for unit testing purposes. - bool ForceChildren; + bool ForceChildren = false; /// Children DIEs. 
IntrusiveBackList Children; @@ -673,20 +680,19 @@ class DIE : IntrusiveBackListNode, public DIEValueList { /// DIEUnit which contains this DIE as its unit DIE. PointerUnion Owner; - DIE() = delete; - explicit DIE(dwarf::Tag Tag) : Offset(0), Size(0), Tag(Tag), - ForceChildren(false) {} + explicit DIE(dwarf::Tag Tag) : Tag(Tag) {} public: + DIE() = delete; + DIE(const DIE &RHS) = delete; + DIE(DIE &&RHS) = delete; + DIE &operator=(const DIE &RHS) = delete; + DIE &operator=(const DIE &&RHS) = delete; + static DIE *get(BumpPtrAllocator &Alloc, dwarf::Tag Tag) { return new (Alloc) DIE(Tag); } - DIE(const DIE &RHS) = delete; - DIE(DIE &&RHS) = delete; - void operator=(const DIE &RHS) = delete; - void operator=(const DIE &&RHS) = delete; - // Accessors. unsigned getAbbrevNumber() const { return AbbrevNumber; } dwarf::Tag getTag() const { return Tag; } @@ -696,10 +702,10 @@ public: bool hasChildren() const { return ForceChildren || !Children.empty(); } void setForceChildren(bool B) { ForceChildren = B; } - typedef IntrusiveBackList::iterator child_iterator; - typedef IntrusiveBackList::const_iterator const_child_iterator; - typedef iterator_range child_range; - typedef iterator_range const_child_range; + using child_iterator = IntrusiveBackList::iterator; + using const_child_iterator = IntrusiveBackList::const_iterator; + using child_range = iterator_range; + using const_child_range = iterator_range; child_range children() { return make_range(Children.begin(), Children.end()); @@ -838,10 +844,10 @@ struct BasicDIEUnit final : DIEUnit { /// DIELoc - Represents an expression location. // class DIELoc : public DIEValueList { - mutable unsigned Size; // Size in bytes excluding size header. + mutable unsigned Size = 0; // Size in bytes excluding size header. public: - DIELoc() : Size(0) {} + DIELoc() = default; /// ComputeSize - Calculate the size of the location expression. /// @@ -872,10 +878,10 @@ public: /// DIEBlock - Represents a block of values. // class DIEBlock : public DIEValueList { - mutable unsigned Size; // Size in bytes excluding size header. + mutable unsigned Size = 0; // Size in bytes excluding size header. public: - DIEBlock() : Size(0) {} + DIEBlock() = default; /// ComputeSize - Calculate the size of the location expression. /// diff --git a/include/llvm/CodeGen/FaultMaps.h b/include/llvm/CodeGen/FaultMaps.h index 0f0005b83c54..98ff526dfe94 100644 --- a/include/llvm/CodeGen/FaultMaps.h +++ b/include/llvm/CodeGen/FaultMaps.h @@ -56,7 +56,7 @@ private: HandlerOffsetExpr(HandlerOffset) {} }; - typedef std::vector FunctionFaultInfos; + using FunctionFaultInfos = std::vector; // We'd like to keep a stable iteration order for FunctionInfos to help // FileCheck based testing. @@ -78,20 +78,17 @@ private: /// generated by the version of LLVM that includes it. No guarantees are made /// with respect to forward or backward compatibility. 
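Most of the DIE.h churn above is mechanical C++11 cleanup: default member initializers replace constructor-initializer boilerplate, trivial constructors become = default, and the deleted special members are grouped in the public section. In miniature, with names invented for the sketch:

    struct Entry {
      unsigned Offset = 0;          // was ": Offset(0)" in every constructor
      unsigned Size = 0;
      bool ForceChildren = false;
      int Tag;

      Entry() = delete;             // must be constructed with a tag
      explicit Entry(int T) : Tag(T) {}
      Entry(const Entry &) = delete;
      Entry &operator=(const Entry &) = delete;
    };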
class FaultMapParser { - typedef uint8_t FaultMapVersionType; - static const size_t FaultMapVersionOffset = 0; + using FaultMapVersionType = uint8_t; + using Reserved0Type = uint8_t; + using Reserved1Type = uint16_t; + using NumFunctionsType = uint32_t; - typedef uint8_t Reserved0Type; + static const size_t FaultMapVersionOffset = 0; static const size_t Reserved0Offset = FaultMapVersionOffset + sizeof(FaultMapVersionType); - - typedef uint16_t Reserved1Type; static const size_t Reserved1Offset = Reserved0Offset + sizeof(Reserved0Type); - - typedef uint32_t NumFunctionsType; static const size_t NumFunctionsOffset = Reserved1Offset + sizeof(Reserved1Type); - static const size_t FunctionInfosOffset = NumFunctionsOffset + sizeof(NumFunctionsType); @@ -105,14 +102,13 @@ class FaultMapParser { public: class FunctionFaultInfoAccessor { - typedef uint32_t FaultKindType; - static const size_t FaultKindOffset = 0; + using FaultKindType = uint32_t; + using FaultingPCOffsetType = uint32_t; + using HandlerPCOffsetType = uint32_t; - typedef uint32_t FaultingPCOffsetType; + static const size_t FaultKindOffset = 0; static const size_t FaultingPCOffsetOffset = FaultKindOffset + sizeof(FaultKindType); - - typedef uint32_t HandlerPCOffsetType; static const size_t HandlerPCOffsetOffset = FaultingPCOffsetOffset + sizeof(FaultingPCOffsetType); @@ -140,20 +136,17 @@ public: }; class FunctionInfoAccessor { - typedef uint64_t FunctionAddrType; - static const size_t FunctionAddrOffset = 0; + using FunctionAddrType = uint64_t; + using NumFaultingPCsType = uint32_t; + using ReservedType = uint32_t; - typedef uint32_t NumFaultingPCsType; + static const size_t FunctionAddrOffset = 0; static const size_t NumFaultingPCsOffset = FunctionAddrOffset + sizeof(FunctionAddrType); - - typedef uint32_t ReservedType; static const size_t ReservedOffset = NumFaultingPCsOffset + sizeof(NumFaultingPCsType); - static const size_t FunctionFaultInfosOffset = ReservedOffset + sizeof(ReservedType); - static const size_t FunctionInfoHeaderSize = FunctionFaultInfosOffset; const uint8_t *P = nullptr; diff --git a/include/llvm/CodeGen/GlobalISel/Localizer.h b/include/llvm/CodeGen/GlobalISel/Localizer.h new file mode 100644 index 000000000000..0a46eb9e7840 --- /dev/null +++ b/include/llvm/CodeGen/GlobalISel/Localizer.h @@ -0,0 +1,78 @@ +//== llvm/CodeGen/GlobalISel/Localizer.h - Localizer -------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file describes the interface of the Localizer pass. +/// This pass moves/duplicates constant-like instructions close to their uses. +/// Its primarily goal is to workaround the deficiencies of the fast register +/// allocator. +/// With GlobalISel constants are all materialized in the entry block of +/// a function. However, the fast allocator cannot rematerialize constants and +/// has a lot more live-ranges to deal with and will most likely end up +/// spilling a lot. +/// By pushing the constants close to their use, we only create small +/// live-ranges. 
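The FaultMapParser hunk above only regroups the typedefs and offset constants; the encoded layout itself is unchanged. Spelled out, the section header is one byte of version, three reserved bytes, and a 32-bit function count, so the per-function records start at byte 8. A small self-contained check of that arithmetic:

    #include <cstddef>
    #include <cstdint>

    constexpr size_t FaultMapVersionOffset = 0;
    constexpr size_t Reserved0Offset     = FaultMapVersionOffset + sizeof(uint8_t);  // 1
    constexpr size_t Reserved1Offset     = Reserved0Offset + sizeof(uint8_t);        // 2
    constexpr size_t NumFunctionsOffset  = Reserved1Offset + sizeof(uint16_t);       // 4
    constexpr size_t FunctionInfosOffset = NumFunctionsOffset + sizeof(uint32_t);    // 8
    static_assert(FunctionInfosOffset == 8, "fault-map header is 8 bytes");

    // Each FunctionInfo record: u64 function address, u32 fault count, u32 reserved,
    // then the per-fault records.
    constexpr size_t FunctionFaultInfosOffset =
        sizeof(uint64_t) + sizeof(uint32_t) + sizeof(uint32_t);                      // 16
    static_assert(FunctionFaultInfosOffset == 16, "per-function header is 16 bytes");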
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_GLOBALISEL_LOCALIZER_H +#define LLVM_CODEGEN_GLOBALISEL_LOCALIZER_H + +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +namespace llvm { +// Forward declarations. +class MachineRegisterInfo; + +/// This pass implements the localization mechanism described at the +/// top of this file. One specificity of the implementation is that +/// it will materialize one and only one instance of a constant per +/// basic block, thus enabling reuse of that constant within that block. +/// Moreover, it only materializes constants in blocks where they +/// are used. PHI uses are considered happening at the end of the +/// related predecessor. +class Localizer : public MachineFunctionPass { +public: + static char ID; + +private: + /// MRI contains all the register class/bank information that this + /// pass uses and updates. + MachineRegisterInfo *MRI; + + /// Check whether or not \p MI needs to be moved close to its uses. + static bool shouldLocalize(const MachineInstr &MI); + + /// Check if \p MOUse is used in the same basic block as \p Def. + /// If the use is in the same block, we say it is local. + /// When the use is not local, \p InsertMBB will contain the basic + /// block when to insert \p Def to have a local use. + static bool isLocalUse(MachineOperand &MOUse, const MachineInstr &Def, + MachineBasicBlock *&InsertMBB); + + /// Initialize the field members using \p MF. + void init(MachineFunction &MF); + +public: + Localizer(); + + StringRef getPassName() const override { return "Localizer"; } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties() + .set(MachineFunctionProperties::Property::IsSSA) + .set(MachineFunctionProperties::Property::Legalized) + .set(MachineFunctionProperties::Property::RegBankSelected); + } + + bool runOnMachineFunction(MachineFunction &MF) override; +}; + +} // End namespace llvm. + +#endif diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index f2a9a9f73ca6..2300a106c358 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -264,6 +264,14 @@ namespace ISD { /// optimized. STRICT_FADD, STRICT_FSUB, STRICT_FMUL, STRICT_FDIV, STRICT_FREM, + /// Constrained versions of libm-equivalent floating point intrinsics. + /// These will be lowered to the equivalent non-constrained pseudo-op + /// (or expanded to the equivalent library call) before final selection. + /// They are used to limit optimizations while the DAG is being optimized. + STRICT_FSQRT, STRICT_FPOW, STRICT_FPOWI, STRICT_FSIN, STRICT_FCOS, + STRICT_FEXP, STRICT_FEXP2, STRICT_FLOG, STRICT_FLOG10, STRICT_FLOG2, + STRICT_FRINT, STRICT_FNEARBYINT, + /// FMA - Perform a * b + c with no intermediate rounding step. 
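The STRICT_* opcodes above are the SelectionDAG counterparts of the constrained floating-point intrinsics: the intrinsic keeps the operation opaque to IR-level optimization, and the node is only rewritten to the ordinary opcode (or a libcall) late in selection. A hedged sketch of building one such call with IRBuilder, assuming the matching intrinsic is exposed as Intrinsic::experimental_constrained_sqrt and takes the usual rounding-mode and exception-behavior metadata operands:

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Metadata.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // For a double %x this requests @llvm.experimental.constrained.sqrt.f64 with
    // !"round.dynamic" and !"fpexcept.strict" metadata arguments.
    static Value *emitConstrainedSqrt(IRBuilder<> &B, Module &M, Value *X) {
      Function *F = Intrinsic::getDeclaration(
          &M, Intrinsic::experimental_constrained_sqrt, {X->getType()});
      LLVMContext &Ctx = B.getContext();
      Value *Round = MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));
      Value *Except = MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
      return B.CreateCall(F, {X, Round, Except});
    }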
FMA, diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h index b792cba4b78a..40cd146f88f8 100644 --- a/include/llvm/CodeGen/LiveInterval.h +++ b/include/llvm/CodeGen/LiveInterval.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/LiveInterval.h - Interval representation ---*- C++ -*-===// +//===- llvm/CodeGen/LiveInterval.h - Interval representation ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -21,22 +21,30 @@ #ifndef LLVM_CODEGEN_LIVEINTERVAL_H #define LLVM_CODEGEN_LIVEINTERVAL_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/SlotIndexes.h" +#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/Allocator.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/MathExtras.h" +#include #include -#include +#include +#include +#include #include +#include +#include namespace llvm { + class CoalescerPair; class LiveIntervals; - class MachineInstr; class MachineRegisterInfo; - class TargetRegisterInfo; class raw_ostream; - template class SmallPtrSet; /// VNInfo - Value Number Information. /// This class holds information about a machine level values, including @@ -44,7 +52,7 @@ namespace llvm { /// class VNInfo { public: - typedef BumpPtrAllocator Allocator; + using Allocator = BumpPtrAllocator; /// The ID number of this value. unsigned id; @@ -53,14 +61,10 @@ namespace llvm { SlotIndex def; /// VNInfo constructor. - VNInfo(unsigned i, SlotIndex d) - : id(i), def(d) - { } + VNInfo(unsigned i, SlotIndex d) : id(i), def(d) {} /// VNInfo constructor, copies values from orig, except for the value number. - VNInfo(unsigned i, const VNInfo &orig) - : id(i), def(orig.def) - { } + VNInfo(unsigned i, const VNInfo &orig) : id(i), def(orig.def) {} /// Copy from the parameter into this VNInfo. void copyFrom(VNInfo &src) { @@ -152,16 +156,16 @@ namespace llvm { /// segment with a new value number is used. class LiveRange { public: - /// This represents a simple continuous liveness interval for a value. /// The start point is inclusive, the end point exclusive. These intervals /// are rendered as [start,end). struct Segment { SlotIndex start; // Start point of the interval (inclusive) SlotIndex end; // End point of the interval (exclusive) - VNInfo *valno; // identifier for the value contained in this segment. + VNInfo *valno = nullptr; // identifier for the value contained in this + // segment. - Segment() : valno(nullptr) {} + Segment() = default; Segment(SlotIndex S, SlotIndex E, VNInfo *V) : start(S), end(E), valno(V) { @@ -189,8 +193,8 @@ namespace llvm { void dump() const; }; - typedef SmallVector Segments; - typedef SmallVector VNInfoList; + using Segments = SmallVector; + using VNInfoList = SmallVector; Segments segments; // the liveness segments VNInfoList valnos; // value#'s @@ -198,22 +202,24 @@ namespace llvm { // The segment set is used temporarily to accelerate initial computation // of live ranges of physical registers in computeRegUnitRange. // After that the set is flushed to the segment vector and deleted. 
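LiveRange::Segment above keeps the long-standing half-open convention: start is inclusive, end is exclusive, so an index is covered iff start <= idx < end and back-to-back segments never overlap. A tiny stand-alone illustration (plain structs, not the LLVM types):

    #include <cassert>

    struct Seg { unsigned Start, End; };   // models [Start, End)

    static bool contains(const Seg &S, unsigned Idx) {
      return S.Start <= Idx && Idx < S.End;
    }
    static bool overlaps(const Seg &A, const Seg &B) {
      return A.Start < B.End && B.Start < A.End;
    }

    int main() {
      Seg A{4, 8}, B{8, 12};
      assert(contains(A, 4) && !contains(A, 8));  // the end point is excluded
      assert(!overlaps(A, B));                    // touching segments do not overlap
      return 0;
    }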
- typedef std::set SegmentSet; + using SegmentSet = std::set; std::unique_ptr segmentSet; - typedef Segments::iterator iterator; + using iterator = Segments::iterator; + using const_iterator = Segments::const_iterator; + iterator begin() { return segments.begin(); } iterator end() { return segments.end(); } - typedef Segments::const_iterator const_iterator; const_iterator begin() const { return segments.begin(); } const_iterator end() const { return segments.end(); } - typedef VNInfoList::iterator vni_iterator; + using vni_iterator = VNInfoList::iterator; + using const_vni_iterator = VNInfoList::const_iterator; + vni_iterator vni_begin() { return valnos.begin(); } vni_iterator vni_end() { return valnos.end(); } - typedef VNInfoList::const_iterator const_vni_iterator; const_vni_iterator vni_begin() const { return valnos.begin(); } const_vni_iterator vni_end() const { return valnos.end(); } @@ -631,40 +637,37 @@ namespace llvm { /// or stack slot. class LiveInterval : public LiveRange { public: - typedef LiveRange super; + using super = LiveRange; /// A live range for subregisters. The LaneMask specifies which parts of the /// super register are covered by the interval. /// (@sa TargetRegisterInfo::getSubRegIndexLaneMask()). class SubRange : public LiveRange { public: - SubRange *Next; + SubRange *Next = nullptr; LaneBitmask LaneMask; /// Constructs a new SubRange object. - SubRange(LaneBitmask LaneMask) - : Next(nullptr), LaneMask(LaneMask) { - } + SubRange(LaneBitmask LaneMask) : LaneMask(LaneMask) {} /// Constructs a new SubRange object by copying liveness from @p Other. SubRange(LaneBitmask LaneMask, const LiveRange &Other, BumpPtrAllocator &Allocator) - : LiveRange(Other, Allocator), Next(nullptr), LaneMask(LaneMask) { - } + : LiveRange(Other, Allocator), LaneMask(LaneMask) {} void print(raw_ostream &OS) const; void dump() const; }; private: - SubRange *SubRanges; ///< Single linked list of subregister live ranges. + SubRange *SubRanges = nullptr; ///< Single linked list of subregister live + /// ranges. public: const unsigned reg; // the register or stack slot of this interval. float weight; // weight of this interval - LiveInterval(unsigned Reg, float Weight) - : SubRanges(nullptr), reg(Reg), weight(Weight) {} + LiveInterval(unsigned Reg, float Weight) : reg(Reg), weight(Weight) {} ~LiveInterval() { clearSubRanges(); @@ -673,8 +676,10 @@ namespace llvm { template class SingleLinkedListIterator { T *P; + public: SingleLinkedListIterator(T *P) : P(P) {} + SingleLinkedListIterator &operator++() { P = P->Next; return *this; @@ -698,7 +703,9 @@ namespace llvm { } }; - typedef SingleLinkedListIterator subrange_iterator; + using subrange_iterator = SingleLinkedListIterator; + using const_subrange_iterator = SingleLinkedListIterator; + subrange_iterator subrange_begin() { return subrange_iterator(SubRanges); } @@ -706,7 +713,6 @@ namespace llvm { return subrange_iterator(nullptr); } - typedef SingleLinkedListIterator const_subrange_iterator; const_subrange_iterator subrange_begin() const { return const_subrange_iterator(SubRanges); } @@ -759,12 +765,12 @@ namespace llvm { /// isSpillable - Can this interval be spilled? 
bool isSpillable() const { - return weight != llvm::huge_valf; + return weight != huge_valf; } /// markNotSpillable - Mark interval as not spillable void markNotSpillable() { - weight = llvm::huge_valf; + weight = huge_valf; } /// For a given lane mask @p LaneMask, compute indexes at which the @@ -931,5 +937,7 @@ namespace llvm { void Distribute(LiveInterval &LI, LiveInterval *LIV[], MachineRegisterInfo &MRI); }; -} -#endif + +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVEINTERVAL_H diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h index 181cb375de86..820e88362483 100644 --- a/include/llvm/CodeGen/LiveIntervalAnalysis.h +++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h @@ -1,4 +1,4 @@ -//===-- LiveIntervalAnalysis.h - Live Interval Analysis ---------*- C++ -*-===// +//===- LiveIntervalAnalysis.h - Live Interval Analysis ----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -20,6 +20,7 @@ #ifndef LLVM_CODEGEN_LIVEINTERVALANALYSIS_H #define LLVM_CODEGEN_LIVEINTERVALANALYSIS_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" @@ -27,27 +28,29 @@ #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/SlotIndexes.h" -#include "llvm/Support/Allocator.h" +#include "llvm/MC/LaneBitmask.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Target/TargetRegisterInfo.h" -#include +#include +#include +#include namespace llvm { extern cl::opt UseSegmentSetForPhysRegs; - class BitVector; - class BlockFrequency; - class LiveRangeCalc; - class LiveVariables; - class MachineDominatorTree; - class MachineLoopInfo; - class TargetRegisterInfo; - class MachineRegisterInfo; - class TargetInstrInfo; - class TargetRegisterClass; - class VirtRegMap; - class MachineBlockFrequencyInfo; +class BitVector; +class LiveRangeCalc; +class MachineBlockFrequencyInfo; +class MachineDominatorTree; +class MachineFunction; +class MachineInstr; +class MachineRegisterInfo; +class raw_ostream; +class TargetInstrInfo; +class VirtRegMap; class LiveIntervals : public MachineFunctionPass { MachineFunction* MF; @@ -56,8 +59,8 @@ extern cl::opt UseSegmentSetForPhysRegs; const TargetInstrInfo* TII; AliasAnalysis *AA; SlotIndexes* Indexes; - MachineDominatorTree *DomTree; - LiveRangeCalc *LRCalc; + MachineDominatorTree *DomTree = nullptr; + LiveRangeCalc *LRCalc = nullptr; /// Special pool allocator for VNInfo's (LiveInterval val#). 
VNInfo::Allocator VNInfoAllocator; @@ -95,6 +98,7 @@ extern cl::opt UseSegmentSetForPhysRegs; public: static char ID; + LiveIntervals(); ~LiveIntervals() override; @@ -466,6 +470,7 @@ extern cl::opt UseSegmentSetForPhysRegs; class HMEditor; }; -} // End llvm namespace -#endif +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVEINTERVALANALYSIS_H diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h index 57e3deb038af..b922e543c856 100644 --- a/include/llvm/CodeGen/LiveIntervalUnion.h +++ b/include/llvm/CodeGen/LiveIntervalUnion.h @@ -26,12 +26,14 @@ namespace llvm { +class raw_ostream; class TargetRegisterInfo; #ifndef NDEBUG // forward declaration template class SparseBitVector; -typedef SparseBitVector<128> LiveVirtRegBitSet; + +using LiveVirtRegBitSet = SparseBitVector<128>; #endif /// Union of live intervals that are strong candidates for coalescing into a @@ -42,19 +44,19 @@ class LiveIntervalUnion { // A set of live virtual register segments that supports fast insertion, // intersection, and removal. // Mapping SlotIndex intervals to virtual register numbers. - typedef IntervalMap LiveSegments; + using LiveSegments = IntervalMap; public: // SegmentIter can advance to the next segment ordered by starting position // which may belong to a different live virtual register. We also must be able // to reach the current segment's containing virtual register. - typedef LiveSegments::iterator SegmentIter; + using SegmentIter = LiveSegments::iterator; /// Const version of SegmentIter. - typedef LiveSegments::const_iterator ConstSegmentIter; + using ConstSegmentIter = LiveSegments::const_iterator; // LiveIntervalUnions share an external allocator. - typedef LiveSegments::Allocator Allocator; + using Allocator = LiveSegments::Allocator; private: unsigned Tag = 0; // unique tag for current contents. @@ -76,7 +78,7 @@ public: SlotIndex startIndex() const { return Segments.start(); } // Provide public access to the underlying map to allow overlap iteration. - typedef LiveSegments Map; + using Map = LiveSegments; const Map &getMap() const { return Segments; } /// getTag - Return an opaque tag representing the current state of the union. diff --git a/include/llvm/CodeGen/LivePhysRegs.h b/include/llvm/CodeGen/LivePhysRegs.h index 9e04c467fadc..f9c741dd75b2 100644 --- a/include/llvm/CodeGen/LivePhysRegs.h +++ b/include/llvm/CodeGen/LivePhysRegs.h @@ -7,23 +7,24 @@ // //===----------------------------------------------------------------------===// // -// This file implements the LivePhysRegs utility for tracking liveness of -// physical registers. This can be used for ad-hoc liveness tracking after -// register allocation. You can start with the live-ins/live-outs at the -// beginning/end of a block and update the information while walking the -// instructions inside the block. This implementation tracks the liveness on a -// sub-register granularity. -// -// We assume that the high bits of a physical super-register are not preserved -// unless the instruction has an implicit-use operand reading the super- -// register. -// -// X86 Example: -// %YMM0 = ... -// %XMM0 = ... (Kills %XMM0, all %XMM0s sub-registers, and %YMM0) -// -// %YMM0 = ... -// %XMM0 = ..., %YMM0 (%YMM0 and all its sub-registers are alive) +/// \file +/// This file implements the LivePhysRegs utility for tracking liveness of +/// physical registers. This can be used for ad-hoc liveness tracking after +/// register allocation. 
You can start with the live-ins/live-outs at the +/// beginning/end of a block and update the information while walking the +/// instructions inside the block. This implementation tracks the liveness on a +/// sub-register granularity. +/// +/// We assume that the high bits of a physical super-register are not preserved +/// unless the instruction has an implicit-use operand reading the super- +/// register. +/// +/// X86 Example: +/// %YMM0 = ... +/// %XMM0 = ... (Kills %XMM0, all %XMM0s sub-registers, and %YMM0) +/// +/// %YMM0 = ... +/// %XMM0 = ..., %YMM0 (%YMM0 and all its sub-registers are alive) //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_LIVEPHYSREGS_H @@ -39,40 +40,42 @@ namespace llvm { class MachineInstr; +class MachineOperand; +class MachineRegisterInfo; +class raw_ostream; -/// \brief A set of live physical registers with functions to track liveness +/// \brief A set of physical registers with utility functions to track liveness /// when walking backward/forward through a basic block. class LivePhysRegs { const TargetRegisterInfo *TRI = nullptr; SparseSet LiveRegs; +public: + /// Constructs an unitialized set. init() needs to be called to initialize it. + LivePhysRegs() = default; + + /// Constructs and initializes an empty set. + LivePhysRegs(const TargetRegisterInfo &TRI) : TRI(&TRI) { + LiveRegs.setUniverse(TRI.getNumRegs()); + } + LivePhysRegs(const LivePhysRegs&) = delete; LivePhysRegs &operator=(const LivePhysRegs&) = delete; -public: - /// \brief Constructs a new empty LivePhysRegs set. - LivePhysRegs() = default; - - /// \brief Constructs and initialize an empty LivePhysRegs set. - LivePhysRegs(const TargetRegisterInfo *TRI) : TRI(TRI) { - assert(TRI && "Invalid TargetRegisterInfo pointer."); - LiveRegs.setUniverse(TRI->getNumRegs()); - } - - /// \brief Clear and initialize the LivePhysRegs set. + /// (re-)initializes and clears the set. void init(const TargetRegisterInfo &TRI) { this->TRI = &TRI; LiveRegs.clear(); LiveRegs.setUniverse(TRI.getNumRegs()); } - /// \brief Clears the LivePhysRegs set. + /// Clears the set. void clear() { LiveRegs.clear(); } - /// \brief Returns true if the set is empty. + /// Returns true if the set is empty. bool empty() const { return LiveRegs.empty(); } - /// \brief Adds a physical register and all its sub-registers to the set. + /// Adds a physical register and all its sub-registers to the set. void addReg(unsigned Reg) { assert(TRI && "LivePhysRegs is not initialized."); assert(Reg <= TRI->getNumRegs() && "Expected a physical register."); @@ -90,12 +93,13 @@ public: LiveRegs.erase(*R); } - /// \brief Removes physical registers clobbered by the regmask operand @p MO. + /// Removes physical registers clobbered by the regmask operand \p MO. void removeRegsInMask(const MachineOperand &MO, - SmallVectorImpl> *Clobbers); + SmallVectorImpl> *Clobbers = + nullptr); - /// \brief Returns true if register @p Reg is contained in the set. This also - /// works if only the super register of @p Reg has been defined, because + /// \brief Returns true if register \p Reg is contained in the set. This also + /// works if only the super register of \p Reg has been defined, because /// addReg() always adds all sub-registers to the set as well. /// Note: Returns false if just some sub registers are live, use available() /// when searching a free register. @@ -104,48 +108,48 @@ public: /// Returns true if register \p Reg and no aliasing register is in the set. 
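The recommended use of this class is the backward walk described just below: seed the set with a block's live-outs, then call stepBackward() over the instructions in reverse order so defs are removed and uses become live. A sketch built only from the API in these hunks (the helper name is invented; TRI and MBB are assumed to come from the surrounding pass):

    #include "llvm/CodeGen/LivePhysRegs.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    using namespace llvm;

    static bool isLiveAtBlockTop(const TargetRegisterInfo &TRI,
                                 MachineBasicBlock &MBB, unsigned Reg) {
      LivePhysRegs LiveRegs(TRI);      // empty set, universe sized for this target
      LiveRegs.addLiveOuts(MBB);       // successors' live-ins plus pristines
      for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I)
        LiveRegs.stepBackward(*I);     // remove defs, add uses
      return LiveRegs.contains(Reg);   // liveness at the top of the block
    }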
bool available(const MachineRegisterInfo &MRI, unsigned Reg) const; - /// \brief Simulates liveness when stepping backwards over an - /// instruction(bundle): Remove Defs, add uses. This is the recommended way of - /// calculating liveness. + /// Simulates liveness when stepping backwards over an instruction(bundle). + /// Remove Defs, add uses. This is the recommended way of calculating + /// liveness. void stepBackward(const MachineInstr &MI); - /// \brief Simulates liveness when stepping forward over an - /// instruction(bundle): Remove killed-uses, add defs. This is the not - /// recommended way, because it depends on accurate kill flags. If possible - /// use stepBackward() instead of this function. - /// The clobbers set will be the list of registers either defined or clobbered - /// by a regmask. The operand will identify whether this is a regmask or - /// register operand. + /// Simulates liveness when stepping forward over an instruction(bundle). + /// Remove killed-uses, add defs. This is the not recommended way, because it + /// depends on accurate kill flags. If possible use stepBackward() instead of + /// this function. The clobbers set will be the list of registers either + /// defined or clobbered by a regmask. The operand will identify whether this + /// is a regmask or register operand. void stepForward(const MachineInstr &MI, SmallVectorImpl> &Clobbers); - /// Adds all live-in registers of basic block @p MBB. + /// Adds all live-in registers of basic block \p MBB. /// Live in registers are the registers in the blocks live-in list and the /// pristine registers. void addLiveIns(const MachineBasicBlock &MBB); - /// Adds all live-out registers of basic block @p MBB. + /// Adds all live-out registers of basic block \p MBB. /// Live out registers are the union of the live-in registers of the successor /// blocks and pristine registers. Live out registers of the end block are the /// callee saved registers. void addLiveOuts(const MachineBasicBlock &MBB); - /// Like addLiveOuts() but does not add pristine registers/callee saved + /// Adds all live-out registers of basic block \p MBB but skips pristine /// registers. void addLiveOutsNoPristines(const MachineBasicBlock &MBB); - typedef SparseSet::const_iterator const_iterator; + using const_iterator = SparseSet::const_iterator; + const_iterator begin() const { return LiveRegs.begin(); } const_iterator end() const { return LiveRegs.end(); } - /// \brief Prints the currently live registers to @p OS. + /// Prints the currently live registers to \p OS. void print(raw_ostream &OS) const; - /// \brief Dumps the currently live registers to the debug output. + /// Dumps the currently live registers to the debug output. void dump() const; private: - /// Adds live-in registers from basic block @p MBB, taking associated + /// \brief Adds live-in registers from basic block \p MBB, taking associated /// lane masks into consideration. void addBlockLiveIns(const MachineBasicBlock &MBB); }; @@ -155,11 +159,11 @@ inline raw_ostream &operator<<(raw_ostream &OS, const LivePhysRegs& LR) { return OS; } -/// Compute the live-in list for \p MBB assuming all of its successors live-in -/// lists are up-to-date. Uses the given LivePhysReg instance \p LiveRegs; This -/// is just here to avoid repeated heap allocations when calling this multiple -/// times in a pass. -void computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, +/// \brief Computes the live-in list for \p MBB assuming all of its successors +/// live-in lists are up-to-date. 
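A minimal sketch of the backward liveness walk that the LivePhysRegs comments above recommend (illustrative only, not part of the upstream change): the set is seeded with the block's live-outs and stepped backward over every instruction, after which it holds the block's live-ins. MBB is assumed to be a post-register-allocation MachineBasicBlock, and the helper name is invented for the example.

    // Assumes the usual CodeGen includes (LivePhysRegs.h, MachineFunction.h,
    // TargetSubtargetInfo.h) and llvm::reverse from ADT/STLExtras.h.
    static void computeBlockLiveInsSketch(MachineBasicBlock &MBB) {
      const TargetRegisterInfo &TRI =
          *MBB.getParent()->getSubtarget().getRegisterInfo();
      LivePhysRegs LiveRegs(TRI);    // construct-and-initialize in one step
      LiveRegs.addLiveOuts(MBB);     // seed with live-outs (incl. pristine regs)
      for (MachineInstr &MI : llvm::reverse(MBB))
        LiveRegs.stepBackward(MI);   // remove defs, add uses
      // LiveRegs now holds the live-ins of MBB.
    }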
Uses the given LivePhysReg instance \p +/// LiveRegs; This is just here to avoid repeated heap allocations when calling +/// this multiple times in a pass. +void computeLiveIns(LivePhysRegs &LiveRegs, const MachineRegisterInfo &MRI, MachineBasicBlock &MBB); } // end namespace llvm diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h index 4250777682ba..362d9854a271 100644 --- a/include/llvm/CodeGen/LiveRangeEdit.h +++ b/include/llvm/CodeGen/LiveRangeEdit.h @@ -1,4 +1,4 @@ -//===---- LiveRangeEdit.h - Basic tools for split and spill -----*- C++ -*-===// +//===- LiveRangeEdit.h - Basic tools for split and spill --------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,19 +19,28 @@ #define LLVM_CODEGEN_LIVERANGEEDIT_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetMachine.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Target/TargetSubtargetInfo.h" +#include namespace llvm { class LiveIntervals; class MachineBlockFrequencyInfo; +class MachineInstr; class MachineLoopInfo; +class MachineOperand; +class TargetInstrInfo; +class TargetRegisterInfo; class VirtRegMap; class LiveRangeEdit : private MachineRegisterInfo::Delegate { @@ -39,7 +48,10 @@ public: /// Callback methods for LiveRangeEdit owners. class Delegate { virtual void anchor(); + public: + virtual ~Delegate() = default; + /// Called immediately before erasing a dead machine instruction. virtual void LRE_WillEraseInstruction(MachineInstr *MI) {} @@ -53,8 +65,6 @@ public: /// Called after cloning a virtual register. /// This is used for new registers representing connected components of Old. virtual void LRE_DidCloneVirtReg(unsigned New, unsigned Old) {} - - virtual ~Delegate() {} }; private: @@ -70,7 +80,7 @@ private: const unsigned FirstNew; /// ScannedRemattable - true when remattable values have been identified. - bool ScannedRemattable; + bool ScannedRemattable = false; /// DeadRemats - The saved instructions which have already been dead after /// rematerialization but not deleted yet -- to be done in postOptimization. @@ -78,11 +88,11 @@ private: /// Remattable - Values defined by remattable instructions as identified by /// tii.isTriviallyReMaterializable(). - SmallPtrSet Remattable; + SmallPtrSet Remattable; /// Rematted - Values that were actually rematted, and so need to have their /// live range trimmed or entirely removed. - SmallPtrSet Rematted; + SmallPtrSet Rematted; /// scanRemattable - Identify the Parent values that may rematerialize. void scanRemattable(AliasAnalysis *aa); @@ -94,11 +104,11 @@ private: /// foldAsLoad - If LI has a single use and a single def that can be folded as /// a load, eliminate the register by folding the def into the use. - bool foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead); + bool foldAsLoad(LiveInterval *LI, SmallVectorImpl &Dead); + + using ToShrinkSet = SetVector, + SmallPtrSet>; - typedef SetVector, - SmallPtrSet > ToShrinkSet; /// Helper for eliminateDeadDefs. 
void eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink, AliasAnalysis *AA); @@ -129,26 +139,26 @@ public: SmallPtrSet *deadRemats = nullptr) : Parent(parent), NewRegs(newRegs), MRI(MF.getRegInfo()), LIS(lis), VRM(vrm), TII(*MF.getSubtarget().getInstrInfo()), TheDelegate(delegate), - FirstNew(newRegs.size()), ScannedRemattable(false), - DeadRemats(deadRemats) { + FirstNew(newRegs.size()), DeadRemats(deadRemats) { MRI.setDelegate(this); } ~LiveRangeEdit() override { MRI.resetDelegate(this); } LiveInterval &getParent() const { - assert(Parent && "No parent LiveInterval"); - return *Parent; + assert(Parent && "No parent LiveInterval"); + return *Parent; } + unsigned getReg() const { return getParent().reg; } /// Iterator for accessing the new registers added by this edit. - typedef SmallVectorImpl::const_iterator iterator; - iterator begin() const { return NewRegs.begin()+FirstNew; } + using iterator = SmallVectorImpl::const_iterator; + iterator begin() const { return NewRegs.begin() + FirstNew; } iterator end() const { return NewRegs.end(); } - unsigned size() const { return NewRegs.size()-FirstNew; } + unsigned size() const { return NewRegs.size() - FirstNew; } bool empty() const { return size() == 0; } - unsigned get(unsigned idx) const { return NewRegs[idx+FirstNew]; } + unsigned get(unsigned idx) const { return NewRegs[idx + FirstNew]; } /// pop_back - It allows LiveRangeEdit users to drop new registers. /// The context is when an original def instruction of a register is @@ -176,26 +186,25 @@ public: return createEmptyIntervalFrom(getReg()); } - unsigned create() { - return createFrom(getReg()); - } + unsigned create() { return createFrom(getReg()); } /// anyRematerializable - Return true if any parent values may be /// rematerializable. /// This function must be called before any rematerialization is attempted. - bool anyRematerializable(AliasAnalysis*); + bool anyRematerializable(AliasAnalysis *); /// checkRematerializable - Manually add VNI to the list of rematerializable /// values if DefMI may be rematerializable. bool checkRematerializable(VNInfo *VNI, const MachineInstr *DefMI, - AliasAnalysis*); + AliasAnalysis *); /// Remat - Information needed to rematerialize at a specific location. struct Remat { - VNInfo *ParentVNI; // parent_'s value at the remat location. - MachineInstr *OrigMI; // Instruction defining OrigVNI. It contains the - // real expr for remat. - explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI), OrigMI(nullptr) {} + VNInfo *ParentVNI; // parent_'s value at the remat location. + MachineInstr *OrigMI = nullptr; // Instruction defining OrigVNI. It contains + // the real expr for remat. + + explicit Remat(VNInfo *ParentVNI) : ParentVNI(ParentVNI) {} }; /// canRematerializeAt - Determine if ParentVNI can be rematerialized at @@ -209,10 +218,8 @@ public: /// liveness is not updated. /// Return the SlotIndex of the new instruction. SlotIndex rematerializeAt(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, - const Remat &RM, - const TargetRegisterInfo&, + MachineBasicBlock::iterator MI, unsigned DestReg, + const Remat &RM, const TargetRegisterInfo &, bool Late = false); /// markRematerialized - explicitly mark a value as rematerialized after doing @@ -248,11 +255,10 @@ public: /// calculateRegClassAndHint - Recompute register class and hint for each new /// register. 
- void calculateRegClassAndHint(MachineFunction&, - const MachineLoopInfo&, - const MachineBlockFrequencyInfo&); + void calculateRegClassAndHint(MachineFunction &, const MachineLoopInfo &, + const MachineBlockFrequencyInfo &); }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_LIVERANGEEDIT_H diff --git a/include/llvm/CodeGen/LiveStackAnalysis.h b/include/llvm/CodeGen/LiveStackAnalysis.h index 3ffbe3d775b4..c90ae7b184f4 100644 --- a/include/llvm/CodeGen/LiveStackAnalysis.h +++ b/include/llvm/CodeGen/LiveStackAnalysis.h @@ -1,4 +1,4 @@ -//===-- LiveStackAnalysis.h - Live Stack Slot Analysis ----------*- C++ -*-===// +//===- LiveStackAnalysis.h - Live Stack Slot Analysis -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,13 +18,16 @@ #include "llvm/CodeGen/LiveInterval.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/Support/Allocator.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Pass.h" +#include #include #include namespace llvm { +class TargetRegisterClass; +class TargetRegisterInfo; + class LiveStacks : public MachineFunctionPass { const TargetRegisterInfo *TRI; @@ -33,8 +36,7 @@ class LiveStacks : public MachineFunctionPass { VNInfo::Allocator VNInfoAllocator; /// S2IMap - Stack slot indices to live interval mapping. - /// - typedef std::unordered_map SS2IntervalMap; + using SS2IntervalMap = std::unordered_map; SS2IntervalMap S2IMap; /// S2RCMap - Stack slot indices to register class mapping. @@ -42,12 +44,14 @@ class LiveStacks : public MachineFunctionPass { public: static char ID; // Pass identification, replacement for typeid + LiveStacks() : MachineFunctionPass(ID) { initializeLiveStacksPass(*PassRegistry::getPassRegistry()); } - typedef SS2IntervalMap::iterator iterator; - typedef SS2IntervalMap::const_iterator const_iterator; + using iterator = SS2IntervalMap::iterator; + using const_iterator = SS2IntervalMap::const_iterator; + const_iterator begin() const { return S2IMap.begin(); } const_iterator end() const { return S2IMap.end(); } iterator begin() { return S2IMap.begin(); } @@ -93,6 +97,7 @@ public: /// print - Implement the dump method. 
void print(raw_ostream &O, const Module * = nullptr) const override; }; -} -#endif /* LLVM_CODEGEN_LIVESTACK_ANALYSIS_H */ +} // end namespace llvm + +#endif // LLVM_CODEGEN_LIVESTACK_ANALYSIS_H diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 18d40564856d..8da48c379d00 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -1,4 +1,4 @@ -//===-- llvm/CodeGen/MachineBasicBlock.h ------------------------*- C++ -*-===// +//===- llvm/CodeGen/MachineBasicBlock.h -------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -15,41 +15,50 @@ #define LLVM_CODEGEN_MACHINEBASICBLOCK_H #include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/ilist.h" +#include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/simple_ilist.h" #include "llvm/CodeGen/MachineInstrBundleIterator.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/Support/BranchProbability.h" #include "llvm/MC/LaneBitmask.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Support/DataTypes.h" +#include +#include #include +#include +#include +#include namespace llvm { -class Pass; class BasicBlock; class MachineFunction; class MCSymbol; -class MIPrinter; +class ModuleSlotTracker; +class Pass; class SlotIndexes; class StringRef; class raw_ostream; -class MachineBranchProbabilityInfo; +class TargetRegisterClass; +class TargetRegisterInfo; template <> struct ilist_traits { private: friend class MachineBasicBlock; // Set by the owning MachineBasicBlock. + MachineBasicBlock *Parent; - typedef simple_ilist>::iterator - instr_iterator; + using instr_iterator = + simple_ilist>::iterator; public: void addNodeToList(MachineInstr *N); void removeNodeFromList(MachineInstr *N); void transferNodesFromList(ilist_traits &OldList, instr_iterator First, instr_iterator Last); - void deleteNode(MachineInstr *MI); }; @@ -69,7 +78,8 @@ public: }; private: - typedef ilist> Instructions; + using Instructions = ilist>; + Instructions Insts; const BasicBlock *BB; int Number; @@ -83,12 +93,12 @@ private: /// same order as Successors, or it is empty if we don't use it (disable /// optimization). std::vector Probs; - typedef std::vector::iterator probability_iterator; - typedef std::vector::const_iterator - const_probability_iterator; + using probability_iterator = std::vector::iterator; + using const_probability_iterator = + std::vector::const_iterator; /// Keep track of the physical registers that are livein of the basicblock. - typedef std::vector LiveInVector; + using LiveInVector = std::vector; LiveInVector LiveIns; /// Alignment of the basic block. 
Zero if the basic block does not need to be @@ -113,7 +123,7 @@ private: mutable MCSymbol *CachedMCSymbol = nullptr; // Intrusive list support - MachineBasicBlock() {} + MachineBasicBlock() = default; explicit MachineBasicBlock(MachineFunction &MF, const BasicBlock *BB); @@ -145,16 +155,16 @@ public: const MachineFunction *getParent() const { return xParent; } MachineFunction *getParent() { return xParent; } - typedef Instructions::iterator instr_iterator; - typedef Instructions::const_iterator const_instr_iterator; - typedef Instructions::reverse_iterator reverse_instr_iterator; - typedef Instructions::const_reverse_iterator const_reverse_instr_iterator; + using instr_iterator = Instructions::iterator; + using const_instr_iterator = Instructions::const_iterator; + using reverse_instr_iterator = Instructions::reverse_iterator; + using const_reverse_instr_iterator = Instructions::const_reverse_iterator; - typedef MachineInstrBundleIterator iterator; - typedef MachineInstrBundleIterator const_iterator; - typedef MachineInstrBundleIterator reverse_iterator; - typedef MachineInstrBundleIterator - const_reverse_iterator; + using iterator = MachineInstrBundleIterator; + using const_iterator = MachineInstrBundleIterator; + using reverse_iterator = MachineInstrBundleIterator; + using const_reverse_iterator = + MachineInstrBundleIterator; unsigned size() const { return (unsigned)Insts.size(); } bool empty() const { return Insts.empty(); } @@ -178,8 +188,8 @@ public: reverse_instr_iterator instr_rend () { return Insts.rend(); } const_reverse_instr_iterator instr_rend () const { return Insts.rend(); } - typedef iterator_range instr_range; - typedef iterator_range const_instr_range; + using instr_range = iterator_range; + using const_instr_range = iterator_range; instr_range instrs() { return instr_range(instr_begin(), instr_end()); } const_instr_range instrs() const { return const_instr_range(instr_begin(), instr_end()); @@ -213,18 +223,18 @@ public: } // Machine-CFG iterators - typedef std::vector::iterator pred_iterator; - typedef std::vector::const_iterator const_pred_iterator; - typedef std::vector::iterator succ_iterator; - typedef std::vector::const_iterator const_succ_iterator; - typedef std::vector::reverse_iterator - pred_reverse_iterator; - typedef std::vector::const_reverse_iterator - const_pred_reverse_iterator; - typedef std::vector::reverse_iterator - succ_reverse_iterator; - typedef std::vector::const_reverse_iterator - const_succ_reverse_iterator; + using pred_iterator = std::vector::iterator; + using const_pred_iterator = std::vector::const_iterator; + using succ_iterator = std::vector::iterator; + using const_succ_iterator = std::vector::const_iterator; + using pred_reverse_iterator = + std::vector::reverse_iterator; + using const_pred_reverse_iterator = + std::vector::const_reverse_iterator; + using succ_reverse_iterator = + std::vector::reverse_iterator; + using const_succ_reverse_iterator = + std::vector::const_reverse_iterator; pred_iterator pred_begin() { return Predecessors.begin(); } const_pred_iterator pred_begin() const { return Predecessors.begin(); } pred_iterator pred_end() { return Predecessors.end(); } @@ -307,7 +317,7 @@ public: // Iteration support for live in sets. These sets are kept in sorted // order by their register number. - typedef LiveInVector::const_iterator livein_iterator; + using livein_iterator = LiveInVector::const_iterator; #ifndef NDEBUG /// Unlike livein_begin, this method does not check that the liveness /// information is accurate. 
Still for debug purposes it may be useful @@ -455,7 +465,6 @@ public: /// other block. bool isLayoutSuccessor(const MachineBasicBlock *MBB) const; - /// Return the fallthrough block if the block can implicitly /// transfer control to the block after it by falling off the end of /// it. This should return null if it can reach the block after @@ -695,7 +704,7 @@ public: LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI, unsigned Reg, const_iterator Before, - unsigned Neighborhood=10) const; + unsigned Neighborhood = 10) const; // Debugging methods. void dump() const; @@ -714,7 +723,6 @@ public: /// Return the MCSymbol for this basic block. MCSymbol *getSymbol() const; - private: /// Return probability iterator corresponding to the I successor iterator. probability_iterator getProbabilityIterator(succ_iterator I); @@ -764,8 +772,8 @@ struct MBB2NumberFunctor : // template <> struct GraphTraits { - typedef MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::succ_iterator ChildIteratorType; + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::succ_iterator; static NodeRef getEntryNode(MachineBasicBlock *BB) { return BB; } static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } @@ -773,8 +781,8 @@ template <> struct GraphTraits { }; template <> struct GraphTraits { - typedef const MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::const_succ_iterator ChildIteratorType; + using NodeRef = const MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::const_succ_iterator; static NodeRef getEntryNode(const MachineBasicBlock *BB) { return BB; } static ChildIteratorType child_begin(NodeRef N) { return N->succ_begin(); } @@ -787,28 +795,30 @@ template <> struct GraphTraits { // to be when traversing the predecessor edges of a MBB // instead of the successor edges. 
// -template <> struct GraphTraits > { - typedef MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::pred_iterator ChildIteratorType; +template <> struct GraphTraits> { + using NodeRef = MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::pred_iterator; + static NodeRef getEntryNode(Inverse G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { return N->pred_begin(); } static ChildIteratorType child_end(NodeRef N) { return N->pred_end(); } }; -template <> struct GraphTraits > { - typedef const MachineBasicBlock *NodeRef; - typedef MachineBasicBlock::const_pred_iterator ChildIteratorType; +template <> struct GraphTraits> { + using NodeRef = const MachineBasicBlock *; + using ChildIteratorType = MachineBasicBlock::const_pred_iterator; + static NodeRef getEntryNode(Inverse G) { return G.Graph; } + static ChildIteratorType child_begin(NodeRef N) { return N->pred_begin(); } static ChildIteratorType child_end(NodeRef N) { return N->pred_end(); } }; - - /// MachineInstrSpan provides an interface to get an iteration range /// containing the instruction it was initialized with, along with all /// those instructions inserted prior to or following that instruction @@ -816,6 +826,7 @@ template <> struct GraphTraits > { class MachineInstrSpan { MachineBasicBlock &MBB; MachineBasicBlock::iterator I, B, E; + public: MachineInstrSpan(MachineBasicBlock::iterator I) : MBB(*I->getParent()), @@ -854,6 +865,6 @@ inline IterT skipDebugInstructionsBackward(IterT It, IterT Begin) { return It; } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEBASICBLOCK_H diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h index cd1c204981ed..cba79c818a76 100644 --- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h +++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h @@ -1,4 +1,4 @@ -//===- MachineBlockFrequencyInfo.h - MBB Frequency Analysis -*- C++ -*-----===// +//===- MachineBlockFrequencyInfo.h - MBB Frequency Analysis -----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,26 +17,28 @@ #include "llvm/ADT/Optional.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/Support/BlockFrequency.h" -#include +#include +#include namespace llvm { +template class BlockFrequencyInfoImpl; class MachineBasicBlock; class MachineBranchProbabilityInfo; +class MachineFunction; class MachineLoopInfo; -template class BlockFrequencyInfoImpl; +class raw_ostream; /// MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation /// to estimate machine basic block frequencies. 
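The GraphTraits changes in the MachineBasicBlock.h hunk above are purely mechanical (typedef rewritten as using), so behaviour is unchanged. For readers unfamiliar with these specializations, a rough sketch of how generic graph algorithms consume them; depth_first and inverse_depth_first come from llvm/ADT/DepthFirstIterator.h, and Entry/Exit are assumed blocks of the current function.

    // Forward walk over successor edges (GraphTraits<MachineBasicBlock *>).
    static unsigned countReachable(MachineBasicBlock *Entry) {
      unsigned N = 0;
      for (MachineBasicBlock *MBB : depth_first(Entry)) {
        (void)MBB;
        ++N;
      }
      return N;
    }

    // Backward walk over predecessor edges, via the Inverse<> specializations.
    static unsigned countReaching(MachineBasicBlock *Exit) {
      unsigned N = 0;
      for (MachineBasicBlock *MBB : inverse_depth_first(Exit)) {
        (void)MBB;
        ++N;
      }
      return N;
    }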
class MachineBlockFrequencyInfo : public MachineFunctionPass { - typedef BlockFrequencyInfoImpl ImplType; + using ImplType = BlockFrequencyInfoImpl; std::unique_ptr MBFI; public: static char ID; MachineBlockFrequencyInfo(); - ~MachineBlockFrequencyInfo() override; void getAnalysisUsage(AnalysisUsage &AU) const override; @@ -74,9 +76,8 @@ public: const MachineBasicBlock *MBB) const; uint64_t getEntryFreq() const; - }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEBLOCKFREQUENCYINFO_H diff --git a/include/llvm/CodeGen/MachineDominanceFrontier.h b/include/llvm/CodeGen/MachineDominanceFrontier.h index 4131194a0c0f..370ffbe4862e 100644 --- a/include/llvm/CodeGen/MachineDominanceFrontier.h +++ b/include/llvm/CodeGen/MachineDominanceFrontier.h @@ -11,23 +11,28 @@ #define LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H #include "llvm/Analysis/DominanceFrontier.h" +#include "llvm/Analysis/DominanceFrontierImpl.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" - +#include "llvm/Support/GenericDomTree.h" +#include namespace llvm { class MachineDominanceFrontier : public MachineFunctionPass { ForwardDominanceFrontierBase Base; -public: - typedef DominatorTreeBase DomTreeT; - typedef DomTreeNodeBase DomTreeNodeT; - typedef DominanceFrontierBase::DomSetType DomSetType; - typedef DominanceFrontierBase::iterator iterator; - typedef DominanceFrontierBase::const_iterator const_iterator; - void operator=(const MachineDominanceFrontier &) = delete; +public: + using DomTreeT = DominatorTreeBase; + using DomTreeNodeT = DomTreeNodeBase; + using DomSetType = DominanceFrontierBase::DomSetType; + using iterator = DominanceFrontierBase::iterator; + using const_iterator = + DominanceFrontierBase::const_iterator; + MachineDominanceFrontier(const MachineDominanceFrontier &) = delete; + MachineDominanceFrontier & + operator=(const MachineDominanceFrontier &) = delete; static char ID; @@ -104,6 +109,6 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override; }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEDOMINANCEFRONTIER_H diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h index 30b6cfdd1c36..74a7c3ea04ae 100644 --- a/include/llvm/CodeGen/MachineDominators.h +++ b/include/llvm/CodeGen/MachineDominators.h @@ -1,4 +1,4 @@ -//=- llvm/CodeGen/MachineDominators.h - Machine Dom Calculation --*- C++ -*-==// +//==- llvm/CodeGen/MachineDominators.h - Machine Dom Calculation -*- C++ -*-==// // // The LLVM Compiler Infrastructure // @@ -16,12 +16,15 @@ #define LLVM_CODEGEN_MACHINEDOMINATORS_H #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/GenericDomTreeConstruction.h" +#include #include +#include namespace llvm { @@ -33,7 +36,7 @@ inline void DominatorTreeBase::addRoot(MachineBasicBlock* MBB extern template class DomTreeNodeBase; extern template class DominatorTreeBase; -typedef DomTreeNodeBase MachineDomTreeNode; +using MachineDomTreeNode = DomTreeNodeBase; //===------------------------------------- /// DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to @@ -52,6 +55,7 @@ class MachineDominatorTree : public MachineFunctionPass { /// The splitting of a critical edge is local and thus, it is possible /// to apply several of 
those changes at the same time. mutable SmallVector CriticalEdgesToSplit; + /// \brief Remember all the basic blocks that are inserted during /// edge splitting. /// Invariant: NewBBs == all the basic blocks contained in the NewBB @@ -259,8 +263,8 @@ public: template struct MachineDomTreeGraphTraitsBase { - typedef Node *NodeRef; - typedef ChildIterator ChildIteratorType; + using NodeRef = Node *; + using ChildIteratorType = ChildIterator; static NodeRef getEntryNode(NodeRef N) { return N; } static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } @@ -287,6 +291,6 @@ template <> struct GraphTraits } }; -} +} // end namespace llvm -#endif +#endif // LLVM_CODEGEN_MACHINEDOMINATORS_H diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h index e7e728c1be28..8d040beff7a6 100644 --- a/include/llvm/CodeGen/MachineInstr.h +++ b/include/llvm/CodeGen/MachineInstr.h @@ -826,20 +826,12 @@ public: getOperand(0).getSubReg() == getOperand(1).getSubReg(); } - /// Return true if this is a transient instruction that is - /// either very likely to be eliminated during register allocation (such as - /// copy-like instructions), or if this instruction doesn't have an - /// execution-time cost. - bool isTransient() const { - switch(getOpcode()) { - default: return false; - // Copy-like instructions are usually eliminated during register allocation. - case TargetOpcode::PHI: - case TargetOpcode::COPY: - case TargetOpcode::INSERT_SUBREG: - case TargetOpcode::SUBREG_TO_REG: - case TargetOpcode::REG_SEQUENCE: - // Pseudo-instructions that don't produce any real output. + /// Return true if this instruction doesn't produce any output in the form of + /// executable instructions. + bool isMetaInstruction() const { + switch (getOpcode()) { + default: + return false; case TargetOpcode::IMPLICIT_DEF: case TargetOpcode::KILL: case TargetOpcode::CFI_INSTRUCTION: @@ -850,6 +842,23 @@ public: } } + /// Return true if this is a transient instruction that is either very likely + /// to be eliminated during register allocation (such as copy-like + /// instructions), or if this instruction doesn't have an execution-time cost. + bool isTransient() const { + switch (getOpcode()) { + default: + return isMetaInstruction(); + // Copy-like instructions are usually eliminated during register allocation. + case TargetOpcode::PHI: + case TargetOpcode::COPY: + case TargetOpcode::INSERT_SUBREG: + case TargetOpcode::SUBREG_TO_REG: + case TargetOpcode::REG_SEQUENCE: + return true; + } + } + /// Return the number of instructions inside the MI bundle, excluding the /// bundle header. /// diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h index 6e5c6473ff4a..1026654da3d7 100644 --- a/include/llvm/CodeGen/MachineRegisterInfo.h +++ b/include/llvm/CodeGen/MachineRegisterInfo.h @@ -642,6 +642,11 @@ public: /// void setRegBank(unsigned Reg, const RegisterBank &RegBank); + void setRegClassOrRegBank(unsigned Reg, + const RegClassOrRegBank &RCOrRB){ + VRegInfo[Reg].first = RCOrRB; + } + /// constrainRegClass - Constrain the register class of the specified virtual /// register to be a common subclass of RC and the current register class, /// but only if the new class has at least MinNumRegs registers. 
Return the diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h index e92bb7f74967..d991e4c216d9 100644 --- a/include/llvm/CodeGen/MachineValueType.h +++ b/include/llvm/CodeGen/MachineValueType.h @@ -26,7 +26,7 @@ namespace llvm { /// Machine Value Type. Every type that is supported natively by some /// processor targeted by LLVM occurs here. This means that any legal value /// type can be represented by an MVT. -class MVT { + class MVT { public: enum SimpleValueType : uint8_t { // Simple value types that aren't explicitly part of this enumeration diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h index 99afd8c5c9ab..97aa2aace822 100644 --- a/include/llvm/CodeGen/ScheduleDAG.h +++ b/include/llvm/CodeGen/ScheduleDAG.h @@ -52,14 +52,14 @@ class TargetRegisterInfo; /// These are the different kinds of scheduling dependencies. enum Kind { Data, ///< Regular data dependence (aka true-dependence). - Anti, ///< A register anti-dependedence (aka WAR). + Anti, ///< A register anti-dependence (aka WAR). Output, ///< A register output-dependence (aka WAW). Order ///< Any other ordering dependency. }; // Strong dependencies must be respected by the scheduler. Artificial // dependencies may be removed only if they are redundant with another - // strong depedence. + // strong dependence. // // Weak dependencies may be violated by the scheduling strategy, but only if // the strategy can prove it is correct to do so. @@ -342,7 +342,7 @@ class TargetRegisterInfo; /// BoundaryNodes can have DAG edges, including Data edges, but they do not /// correspond to schedulable entities (e.g. instructions) and do not have a /// valid ID. Consequently, always check for boundary nodes before accessing - /// an assoicative data structure keyed on node ID. + /// an associative data structure keyed on node ID. bool isBoundaryNode() const { return NodeNum == BoundaryID; } /// Assigns the representative SDNode for this SUnit. This may be used diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h index 21e1740aa6b8..f5f5bfd45e79 100644 --- a/include/llvm/CodeGen/ScheduleDAGInstrs.h +++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h @@ -18,6 +18,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SparseMultiSet.h" #include "llvm/ADT/SparseSet.h" +#include "llvm/CodeGen/LivePhysRegs.h" #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/TargetSchedule.h" #include "llvm/Support/Compiler.h" @@ -224,7 +225,7 @@ namespace llvm { MachineInstr *FirstDbgValue; /// Set of live physical registers for updating kill flags. - BitVector LiveRegs; + LivePhysRegs LiveRegs; public: explicit ScheduleDAGInstrs(MachineFunction &mf, @@ -311,7 +312,7 @@ namespace llvm { std::string getDAGName() const override; /// Fixes register kill flags that scheduling has made invalid. - void fixupKills(MachineBasicBlock *MBB); + void fixupKills(MachineBasicBlock &MBB); protected: void initSUnits(); diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index d761661f763e..493122b15704 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -1070,6 +1070,11 @@ public: SDNode *MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef Ops); + /// Mutate the specified strict FP node to its non-strict equivalent, + /// unlinking the node from its chain and dropping the metadata arguments. + /// The node must be a strict FP node. 
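Referring back to the MachineInstr.h hunk above: isTransient() is split so that the guaranteed-no-output opcodes move into the new isMetaInstruction() predicate, while the copy-like opcodes stay in isTransient(). A small illustrative sketch (not part of the change) of how a code-size style count might use the stricter predicate; the helper name is invented here.

    // Count only instructions that will actually emit something. PHIs and
    // COPYs still count here: they are merely likely to disappear
    // (isTransient()), not guaranteed to produce no output
    // (isMetaInstruction()).
    static unsigned countEmittedInstrs(const MachineBasicBlock &MBB) {
      unsigned N = 0;
      for (const MachineInstr &MI : MBB)
        if (!MI.isMetaInstruction())
          ++N;
      return N;
    }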
+ SDNode *mutateStrictFPToFP(SDNode *Node); + /// These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. /// diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 35ddcf80c91f..973c5aac5281 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -612,6 +612,32 @@ public: SDNodeBits.IsMemIntrinsic; } + /// Test if this node is a strict floating point pseudo-op. + bool isStrictFPOpcode() { + switch (NodeType) { + default: + return false; + case ISD::STRICT_FADD: + case ISD::STRICT_FSUB: + case ISD::STRICT_FMUL: + case ISD::STRICT_FDIV: + case ISD::STRICT_FREM: + case ISD::STRICT_FSQRT: + case ISD::STRICT_FPOW: + case ISD::STRICT_FPOWI: + case ISD::STRICT_FSIN: + case ISD::STRICT_FCOS: + case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: + case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG10: + case ISD::STRICT_FLOG2: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: + return true; + } + } + /// Test if this node has a post-isel opcode, directly /// corresponding to a MachineInstr opcode. bool isMachineOpcode() const { return NodeType < 0; } diff --git a/include/llvm/DebugInfo/CodeView/CVRecord.h b/include/llvm/DebugInfo/CodeView/CVRecord.h index 71ea82b6a9ab..68ad09982202 100644 --- a/include/llvm/DebugInfo/CodeView/CVRecord.h +++ b/include/llvm/DebugInfo/CodeView/CVRecord.h @@ -14,6 +14,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/DebugInfo/CodeView/CodeViewError.h" #include "llvm/DebugInfo/CodeView/RecordSerialization.h" +#include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/BinaryStreamRef.h" #include "llvm/Support/Endian.h" @@ -50,6 +51,13 @@ public: Optional Hash; }; +template struct RemappedRecord { + explicit RemappedRecord(const CVRecord &R) : OriginalRecord(R) {} + + CVRecord OriginalRecord; + SmallVector, 8> Mappings; +}; + } // end namespace codeview template diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index 4bc8fbefd5d8..70ccc867cd38 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -46,6 +46,7 @@ Error visitMemberRecordStream(ArrayRef FieldList, TypeVisitorCallbacks &Callbacks); Error visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source = VDS_BytesPresent, TypeServerHandler *TS = nullptr); Error visitTypeStream(CVTypeRange Types, TypeVisitorCallbacks &Callbacks, TypeServerHandler *TS = nullptr); diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index 2142d4a2dec7..a9c5cf42fc5b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -40,6 +40,17 @@ class TypeDeserializer : public TypeVisitorCallbacks { public: TypeDeserializer() = default; + template static Error deserializeAs(CVType &CVT, T &Record) { + MappingInfo I(CVT.content()); + if (auto EC = I.Mapping.visitTypeBegin(CVT)) + return EC; + if (auto EC = I.Mapping.visitKnownRecord(CVT, Record)) + return EC; + if (auto EC = I.Mapping.visitTypeEnd(CVT)) + return EC; + return Error::success(); + } + Error visitTypeBegin(CVType &Record) override { assert(!Mapping && "Already in a type mapping!"); Mapping = llvm::make_unique(Record.content()); diff --git 
a/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h b/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h new file mode 100644 index 000000000000..82ceb5038316 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/TypeIndexDiscovery.h @@ -0,0 +1,33 @@ +//===- TypeIndexDiscovery.h -------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H +#define LLVM_DEBUGINFO_CODEVIEW_TYPEINDEXDISCOVERY_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/DebugInfo/CodeView/TypeRecord.h" +#include "llvm/Support/Error.h" + +namespace llvm { +namespace codeview { +enum class TiRefKind { TypeRef, IndexRef }; +struct TiReference { + TiRefKind Kind; + uint32_t Offset; + uint32_t Count; +}; + +void discoverTypeIndices(ArrayRef RecordData, + SmallVectorImpl &Refs); +void discoverTypeIndices(const CVType &Type, + SmallVectorImpl &Refs); +} +} + +#endif diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h index 1f10872c8768..92745ebfcded 100644 --- a/include/llvm/DebugInfo/CodeView/TypeRecord.h +++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h @@ -35,6 +35,7 @@ using support::ulittle16_t; using support::ulittle32_t; typedef CVRecord CVType; +typedef RemappedRecord RemappedType; struct CVMemberRecord { TypeLeafKind Kind; @@ -278,15 +279,9 @@ public: Attrs(calcAttrs(PK, PM, PO, Size)) {} PointerRecord(TypeIndex ReferentType, PointerKind PK, PointerMode PM, - PointerOptions PO, uint8_t Size, - const MemberPointerInfo &Member) + PointerOptions PO, uint8_t Size, const MemberPointerInfo &MPI) : TypeRecord(TypeRecordKind::Pointer), ReferentType(ReferentType), - Attrs(calcAttrs(PK, PM, PO, Size)), MemberInfo(Member) {} - - PointerRecord(TypeIndex ReferentType, uint32_t Attrs, - const MemberPointerInfo &Member) - : TypeRecord(TypeRecordKind::Pointer), ReferentType(ReferentType), - Attrs(Attrs), MemberInfo(Member) {} + Attrs(calcAttrs(PK, PM, PO, Size)), MemberInfo(MPI) {} TypeIndex getReferentType() const { return ReferentType; } diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h index 6dad98247136..435c43f7edcb 100644 --- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h @@ -17,7 +17,6 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Allocator.h" #include "llvm/Support/Error.h" @@ -26,6 +25,8 @@ namespace llvm { namespace codeview { +class TypeHasher; + class TypeSerializer : public TypeVisitorCallbacks { struct SubRecord { SubRecord(TypeLeafKind K, uint32_t S) : Kind(K), Size(S) {} @@ -45,14 +46,13 @@ class TypeSerializer : public TypeVisitorCallbacks { } }; - typedef SmallVector, 2> RecordList; + typedef SmallVector, 2> MutableRecordList; static constexpr uint8_t ContinuationLength = 8; BumpPtrAllocator &RecordStorage; RecordSegment CurrentSegment; - RecordList FieldListSegments; + MutableRecordList FieldListSegments; - TypeIndex LastTypeIndex; Optional TypeKind; Optional MemberKind; std::vector RecordBuffer; @@ -60,28 +60,35 @@ class TypeSerializer : public TypeVisitorCallbacks { BinaryStreamWriter Writer; TypeRecordMapping Mapping; - RecordList 
SeenRecords; - StringMap HashedRecords; + /// Private type record hashing implementation details are handled here. + std::unique_ptr Hasher; + + /// Contains a list of all records indexed by TypeIndex.toArrayIndex(). + SmallVector, 2> SeenRecords; + + /// Temporary storage that we use to copy a record's data while re-writing + /// its type indices. + SmallVector RemapStorage; + + TypeIndex nextTypeIndex() const; bool isInFieldList() const; - TypeIndex calcNextTypeIndex() const; - TypeIndex incrementTypeIndex(); MutableArrayRef getCurrentSubRecordData(); MutableArrayRef getCurrentRecordData(); Error writeRecordPrefix(TypeLeafKind Kind); - TypeIndex insertRecordBytesPrivate(MutableArrayRef Record); - TypeIndex insertRecordBytesWithCopy(CVType &Record, - MutableArrayRef Data); Expected> addPadding(MutableArrayRef Record); public: - explicit TypeSerializer(BumpPtrAllocator &Storage); + explicit TypeSerializer(BumpPtrAllocator &Storage, bool Hash = true); + ~TypeSerializer(); - ArrayRef> records() const; - TypeIndex getLastTypeIndex() const; - TypeIndex insertRecordBytes(MutableArrayRef Record); + void reset(); + + ArrayRef> records() const; + TypeIndex insertRecordBytes(ArrayRef &Record); + TypeIndex insertRecord(const RemappedType &Record); Expected visitTypeEndGetIndex(CVType &Record); Error visitTypeBegin(CVType &Record) override; diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h index 65bcf9812e68..3ad2b4e9c92f 100644 --- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h +++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h @@ -22,12 +22,75 @@ class TypeIndex; class TypeServerHandler; class TypeTableBuilder; -/// Merges one type stream into another. Returns true on success. -Error mergeTypeStreams(TypeTableBuilder &DestIdStream, - TypeTableBuilder &DestTypeStream, +/// \brief Merge one set of type records into another. This method assumes +/// that all records are type records, and there are no Id records present. +/// +/// \param Dest The table to store the re-written type records into. +/// +/// \param SourceToDest A vector, indexed by the TypeIndex in the source +/// type stream, that contains the index of the corresponding type record +/// in the destination stream. +/// +/// \param Handler (optional) If non-null, an interface that gets invoked +/// to handle type server records. +/// +/// \param Types The collection of types to merge in. +/// +/// \returns Error::success() if the operation succeeded, otherwise an +/// appropriate error code. +Error mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, const CVTypeArray &Types); +/// \brief Merge one set of id records into another. This method assumes +/// that all records are id records, and there are no Type records present. +/// However, since Id records can refer back to Type records, this method +/// assumes that the referenced type records have also been merged into +/// another type stream (for example using the above method), and accepts +/// the mapping from source to dest for that stream so that it can re-write +/// the type record mappings accordingly. +/// +/// \param Dest The table to store the re-written id records into. +/// +/// \param Types The mapping to use for the type records that these id +/// records refer to. +/// +/// \param SourceToDest A vector, indexed by the TypeIndex in the source +/// id stream, that contains the index of the corresponding id record +/// in the destination stream. 
+/// +/// \param Ids The collection of id records to merge in. +/// +/// \returns Error::success() if the operation succeeded, otherwise an +/// appropriate error code. +Error mergeIdRecords(TypeTableBuilder &Dest, ArrayRef Types, + SmallVectorImpl &SourceToDest, + const CVTypeArray &Ids); + +/// \brief Merge a unified set of type and id records, splitting them into +/// separate output streams. +/// +/// \param DestIds The table to store the re-written id records into. +/// +/// \param DestTypes the table to store the re-written type records into. +/// +/// \param SourceToDest A vector, indexed by the TypeIndex in the source +/// id stream, that contains the index of the corresponding id record +/// in the destination stream. +/// +/// \param Handler (optional) If non-null, an interface that gets invoked +/// to handle type server records. +/// +/// \param IdsAndTypes The collection of id records to merge in. +/// +/// \returns Error::success() if the operation succeeded, otherwise an +/// appropriate error code. +Error mergeTypeAndIdRecords(TypeTableBuilder &DestIds, + TypeTableBuilder &DestTypes, + SmallVectorImpl &SourceToDest, + TypeServerHandler *Handler, + const CVTypeArray &IdsAndTypes); + } // end namespace codeview } // end namespace llvm diff --git a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h index 102bee4b0801..7bdc9ecb20cf 100644 --- a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h +++ b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h @@ -64,10 +64,14 @@ public: return *ExpectedIndex; } - TypeIndex writeSerializedRecord(MutableArrayRef Record) { + TypeIndex writeSerializedRecord(ArrayRef Record) { return Serializer.insertRecordBytes(Record); } + TypeIndex writeSerializedRecord(const RemappedType &Record) { + return Serializer.insertRecord(Record); + } + template void ForEachRecord(TFunc Func) { uint32_t Index = TypeIndex::FirstNonSimpleIndex; @@ -77,23 +81,24 @@ public: } } - ArrayRef> records() const { - return Serializer.records(); - } + ArrayRef> records() const { return Serializer.records(); } }; class FieldListRecordBuilder { TypeTableBuilder &TypeTable; + BumpPtrAllocator Allocator; TypeSerializer TempSerializer; CVType Type; public: explicit FieldListRecordBuilder(TypeTableBuilder &TypeTable) - : TypeTable(TypeTable), TempSerializer(TypeTable.getAllocator()) { + : TypeTable(TypeTable), TempSerializer(Allocator, false) { Type.Type = TypeLeafKind::LF_FIELDLIST; } void begin() { + TempSerializer.reset(); + if (auto EC = TempSerializer.visitTypeBegin(Type)) consumeError(std::move(EC)); } @@ -109,23 +114,19 @@ public: consumeError(std::move(EC)); } - TypeIndex end() { + TypeIndex end(bool Write) { + TypeIndex Index; if (auto EC = TempSerializer.visitTypeEnd(Type)) { consumeError(std::move(EC)); return TypeIndex(); } - TypeIndex Index; - for (auto Record : TempSerializer.records()) { - Index = TypeTable.writeSerializedRecord(Record); + if (Write) { + for (auto Record : TempSerializer.records()) + Index = TypeTable.writeSerializedRecord(Record); } - return Index; - } - /// Stop building the record. 
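A rough sketch of how the merge entry points documented in the TypeStreamMerger.h hunk above are meant to be driven for a combined type/id stream (illustrative only; the destination builders and the input array are assumed to be supplied by the caller, the element type of SourceToDest is assumed to be codeview::TypeIndex, and no type server handler is used).

    static llvm::Error
    mergeCombinedStream(llvm::codeview::TypeTableBuilder &DestIds,
                        llvm::codeview::TypeTableBuilder &DestTypes,
                        const llvm::codeview::CVTypeArray &IdsAndTypes) {
      // Source-to-destination index mapping, filled in by the merger.
      llvm::SmallVector<llvm::codeview::TypeIndex, 128> SourceToDest;
      return llvm::codeview::mergeTypeAndIdRecords(
          DestIds, DestTypes, SourceToDest, /*Handler=*/nullptr, IdsAndTypes);
    }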
- void reset() { - if (auto EC = TempSerializer.visitTypeEnd(Type)) - consumeError(std::move(EC)); + return Index; } }; diff --git a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h index 7de562a19a74..42b62ba2b6ce 100644 --- a/include/llvm/DebugInfo/CodeView/TypeTableCollection.h +++ b/include/llvm/DebugInfo/CodeView/TypeTableCollection.h @@ -18,7 +18,7 @@ namespace codeview { class TypeTableCollection : public TypeCollection { public: - explicit TypeTableCollection(ArrayRef> Records); + explicit TypeTableCollection(ArrayRef> Records); Optional getFirst() override; Optional getNext(TypeIndex Prev) override; @@ -33,7 +33,7 @@ private: bool hasCapacityFor(TypeIndex Index) const; void ensureTypeExists(TypeIndex Index); - ArrayRef> Records; + ArrayRef> Records; TypeDatabase Database; }; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h index d3a63edf10ff..7fa68f3f2314 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFContext.h +++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h @@ -46,7 +46,8 @@ class raw_ostream; /// Reads a value from data extractor and applies a relocation to the result if /// one exists for the given offset. uint64_t getRelocatedValue(const DataExtractor &Data, uint32_t Size, - uint32_t *Off, const RelocAddrMap *Relocs); + uint32_t *Off, const RelocAddrMap *Relocs, + uint64_t *SecNdx = nullptr); /// DWARFContext /// This data structure is the top level entity that deals with dwarf debug @@ -71,6 +72,14 @@ class DWARFContext : public DIContext { std::unique_ptr AbbrevDWO; std::unique_ptr LocDWO; + struct DWOFile { + object::OwningBinary File; + std::unique_ptr Context; + }; + StringMap> DWOFiles; + std::weak_ptr DWP; + bool CheckedForDWP = false; + /// Read compile units from the debug_info section (if necessary) /// and store them in CUs. void parseCompileUnits(); @@ -165,6 +174,8 @@ public: return DWOCUs[index].get(); } + DWARFCompileUnit *getDWOCompileUnitForHash(uint64_t Hash); + /// Get a DIE given an exact offset. DWARFDie getDIEForOffset(uint32_t Offset); @@ -206,6 +217,7 @@ public: DIInliningInfo getInliningInfoForAddress(uint64_t Address, DILineInfoSpecifier Specifier = DILineInfoSpecifier()) override; + virtual StringRef getFileName() const = 0; virtual bool isLittleEndian() const = 0; virtual uint8_t getAddressSize() const = 0; virtual const DWARFSection &getInfoSection() = 0; @@ -248,6 +260,8 @@ public: return version == 2 || version == 3 || version == 4 || version == 5; } + std::shared_ptr getDWOContext(StringRef AbsolutePath); + private: /// Return the compile unit that includes an offset (relative to .debug_info). 
DWARFCompileUnit *getCompileUnitForOffset(uint32_t Offset); @@ -263,6 +277,7 @@ private: class DWARFContextInMemory : public DWARFContext { virtual void anchor(); + StringRef FileName; bool IsLittleEndian; uint8_t AddressSize; DWARFSection InfoSection; @@ -316,6 +331,7 @@ public: uint8_t AddrSize, bool isLittleEndian = sys::IsLittleEndianHost); + StringRef getFileName() const override { return FileName; } bool isLittleEndian() const override { return IsLittleEndian; } uint8_t getAddressSize() const override { return AddressSize; } const DWARFSection &getInfoSection() override { return InfoSection; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h index 95ec1be62a79..b436711ae6ed 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h @@ -25,6 +25,7 @@ class raw_ostream; struct DWARFAddressRange { uint64_t LowPC; uint64_t HighPC; + uint64_t SectionIndex; }; /// DWARFAddressRangesVector - represents a set of absolute address ranges. @@ -44,6 +45,8 @@ public: /// address past the end of the address range. The ending address must /// be greater than or equal to the beginning address. uint64_t EndAddress; + /// A section index this range belongs to. + uint64_t SectionIndex; /// The end of any given range list is marked by an end of list entry, /// which consists of a 0 for the beginning address offset diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h index ca94a90fabfc..fa41b9e293c0 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFDie.h +++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h @@ -195,7 +195,8 @@ public: /// Retrieves DW_AT_low_pc and DW_AT_high_pc from CU. /// Returns true if both attributes are present. - bool getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const; + bool getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC, + uint64_t &SectionIndex) const; /// Get the address ranges for this DIE. /// diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h index a30e0be9c3c3..3a781dde8929 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h +++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h @@ -47,6 +47,7 @@ private: const char *cstr; }; const uint8_t *data = nullptr; + uint64_t SectionIndex; /// Section index for reference forms. }; dwarf::Form Form; /// Form for this value. @@ -58,6 +59,7 @@ public: dwarf::Form getForm() const { return Form; } uint64_t getRawUValue() const { return Value.uval; } + uint64_t getSectionIndex() const { return Value.SectionIndex; } void setForm(dwarf::Form F) { Form = F; } void setUValue(uint64_t V) { Value.uval = V; } void setSValue(int64_t V) { Value.sval = V; } diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h index fabacc0abcea..f143de334737 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h +++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h @@ -16,7 +16,10 @@ namespace llvm { +/// RelocAddrEntry contains relocated value and section index. +/// Section index is -1LL if relocation points to absolute symbol. 
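An illustrative sketch (not part of the change) of how a split-DWARF consumer might use the DWARFContext additions above: the DWO context is now obtained through a shared, cached handle rather than being owned by each unit. DWOPath and DWOId are assumed to come from a skeleton compile unit, and the shared_ptr element type is assumed to be DWARFContext as declared in that hunk.

    static DWARFCompileUnit *findDWOUnit(DWARFContext &Ctx, StringRef DWOPath,
                                         uint64_t DWOId) {
      // Shared, cached context for the .dwo (or .dwp) file, if it can be opened.
      if (std::shared_ptr<DWARFContext> DWOCtx = Ctx.getDWOContext(DWOPath))
        return DWOCtx->getDWOCompileUnitForHash(DWOId);
      return nullptr;
    }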
struct RelocAddrEntry { + uint64_t SectionIndex; uint64_t Value; }; diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h index ae7fd24ce5bb..d0f7bd0d623f 100644 --- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h +++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h @@ -143,17 +143,7 @@ class DWARFUnit { typedef iterator_range::iterator> die_iterator_range; - class DWOHolder { - object::OwningBinary DWOFile; - std::unique_ptr DWOContext; - DWARFUnit *DWOU = nullptr; - - public: - DWOHolder(StringRef DWOPath, uint64_t DWOId); - - DWARFUnit *getUnit() const { return DWOU; } - }; - std::unique_ptr DWO; + std::shared_ptr DWO; const DWARFUnitIndex::Entry *IndexEntry; diff --git a/include/llvm/DebugInfo/MSF/MappedBlockStream.h b/include/llvm/DebugInfo/MSF/MappedBlockStream.h index c91f6f725c80..d68f5f70c83e 100644 --- a/include/llvm/DebugInfo/MSF/MappedBlockStream.h +++ b/include/llvm/DebugInfo/MSF/MappedBlockStream.h @@ -43,8 +43,8 @@ class MappedBlockStream : public BinaryStream { friend class WritableMappedBlockStream; public: static std::unique_ptr - createStream(uint32_t BlockSize, uint32_t NumBlocks, - const MSFStreamLayout &Layout, BinaryStreamRef MsfData); + createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, + BinaryStreamRef MsfData); static std::unique_ptr createIndexedStream(const MSFLayout &Layout, BinaryStreamRef MsfData, @@ -74,12 +74,11 @@ public: void invalidateCache(); uint32_t getBlockSize() const { return BlockSize; } - uint32_t getNumBlocks() const { return NumBlocks; } + uint32_t getNumBlocks() const { return StreamLayout.Blocks.size(); } uint32_t getStreamLength() const { return StreamLayout.Length; } protected: - MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks, - const MSFStreamLayout &StreamLayout, + MappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &StreamLayout, BinaryStreamRef MsfData); private: @@ -91,7 +90,6 @@ private: ArrayRef &Buffer); const uint32_t BlockSize; - const uint32_t NumBlocks; const MSFStreamLayout StreamLayout; BinaryStreamRef MsfData; @@ -103,8 +101,8 @@ private: class WritableMappedBlockStream : public WritableBinaryStream { public: static std::unique_ptr - createStream(uint32_t BlockSize, uint32_t NumBlocks, - const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData); + createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, + WritableBinaryStreamRef MsfData); static std::unique_ptr createIndexedStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData, @@ -139,7 +137,7 @@ public: uint32_t getStreamLength() const { return ReadInterface.getStreamLength(); } protected: - WritableMappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks, + WritableMappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &StreamLayout, WritableBinaryStreamRef MsfData); diff --git a/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h index bcac182e2145..e116f314ac0e 100644 --- a/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h @@ -82,6 +82,7 @@ private: Error finalize(); uint32_t calculateModiSubstreamSize() const; + uint32_t calculateNamesOffset() const; uint32_t calculateSectionContribsStreamSize() const; uint32_t calculateSectionMapStreamSize() const; uint32_t calculateFileInfoSubstreamSize() const; diff --git a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h index bfd38b6c80ec..196ba4d6ffbd 100644 
--- a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h +++ b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h @@ -11,8 +11,7 @@ #define LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringSet.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeServerHandler.h" #include "llvm/DebugInfo/PDB/Native/NativeSession.h" @@ -39,7 +38,7 @@ private: bool RevisitAlways; std::unique_ptr Session; - SmallVector, 4> SearchPaths; + StringSet<> SearchPaths; }; } } diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h index c5549983ed43..17fba9991c2e 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h @@ -21,6 +21,9 @@ #include "llvm/Support/Error.h" namespace llvm { +namespace codeview { +class LazyRandomTypeCollection; +} namespace msf { class MappedBlockStream; } @@ -53,12 +56,16 @@ public: codeview::CVTypeRange types(bool *HadError) const; const codeview::CVTypeArray &typeArray() const { return TypeRecords; } + codeview::LazyRandomTypeCollection &typeCollection() { return *Types; } + Error commit(); private: const PDBFile &Pdb; std::unique_ptr Stream; + std::unique_ptr Types; + codeview::CVTypeArray TypeRecords; std::unique_ptr HashStream; diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index d4a896c01867..ace309ed95a4 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -322,7 +322,7 @@ template <> struct DenseMapInfo { /// the AttributeList object. The function attributes are at index /// `AttributeList::FunctionIndex', the return value is at index /// `AttributeList::ReturnIndex', and the attributes for the parameters start at -/// index `1'. +/// index `AttributeList::FirstArgIndex'. class AttributeList { public: enum AttrIndex : unsigned { @@ -347,8 +347,8 @@ public: /// \brief Create an AttributeList with the specified parameters in it. static AttributeList get(LLVMContext &C, ArrayRef> Attrs); - static AttributeList - get(LLVMContext &C, ArrayRef> Attrs); + static AttributeList get(LLVMContext &C, + ArrayRef> Attrs); /// \brief Create an AttributeList from attribute sets for a function, its /// return value, and all of its arguments. @@ -356,13 +356,11 @@ public: AttributeSet RetAttrs, ArrayRef ArgAttrs); - static AttributeList - getImpl(LLVMContext &C, - ArrayRef> Attrs); - private: explicit AttributeList(AttributeListImpl *LI) : pImpl(LI) {} + static AttributeList getImpl(LLVMContext &C, ArrayRef AttrSets); + public: AttributeList() = default; @@ -521,39 +519,31 @@ public: /// \brief Return the attributes at the index as a string. std::string getAsString(unsigned Index, bool InAttrGrp = false) const; - using iterator = ArrayRef::iterator; + //===--------------------------------------------------------------------===// + // AttributeList Introspection + //===--------------------------------------------------------------------===// - iterator begin(unsigned Slot) const; - iterator end(unsigned Slot) const; + typedef const AttributeSet *iterator; + iterator begin() const; + iterator end() const; + + unsigned getNumAttrSets() const; + + /// Use these to iterate over the valid attribute indices. 
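The Attributes.h hunk above replaces the slot-based introspection (getNumSlots/getSlotAttributes) with direct iteration over attribute sets plus index_begin()/index_end(), which are declared immediately below this point. A small sketch of both styles (illustrative only; AL is an assumed AttributeList, and getAttributes(unsigned) is assumed to be the existing per-index accessor).

    static bool hasAnyAttributes(const AttributeList &AL) {
      // Style 1: walk the attribute sets directly via the new begin()/end().
      for (AttributeSet AS : AL)
        if (AS.hasAttributes())
          return true;
      return false;
    }

    static void visitAllIndices(const AttributeList &AL) {
      // Style 2: walk the valid indices. index_begin() is FunctionIndex, and
      // incrementing wraps around to ReturnIndex and then the argument indices.
      for (unsigned I = AL.index_begin(), E = AL.index_end(); I != E; ++I) {
        AttributeSet AS = AL.getAttributes(I);
        (void)AS;
      }
    }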
+ unsigned index_begin() const { return AttributeList::FunctionIndex; } + unsigned index_end() const { return getNumAttrSets() - 1; } /// operator==/!= - Provide equality predicates. bool operator==(const AttributeList &RHS) const { return pImpl == RHS.pImpl; } bool operator!=(const AttributeList &RHS) const { return pImpl != RHS.pImpl; } - //===--------------------------------------------------------------------===// - // AttributeList Introspection - //===--------------------------------------------------------------------===// - /// \brief Return a raw pointer that uniquely identifies this attribute list. void *getRawPointer() const { return pImpl; } /// \brief Return true if there are no attributes. - bool isEmpty() const { - return getNumSlots() == 0; - } - - /// \brief Return the number of slots used in this attribute list. This is - /// the number of arguments that have an attribute set on them (including the - /// function itself). - unsigned getNumSlots() const; - - /// \brief Return the index for the given slot. - unsigned getSlotIndex(unsigned Slot) const; - - /// \brief Return the attributes at the given slot. - AttributeSet getSlotAttributes(unsigned Slot) const; + bool isEmpty() const { return pImpl == nullptr; } void dump() const; }; diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h index c917b1f2cada..235cb57cfd09 100644 --- a/include/llvm/IR/BasicBlock.h +++ b/include/llvm/IR/BasicBlock.h @@ -33,6 +33,7 @@ class Function; class LandingPadInst; class LLVMContext; class Module; +class PHINode; class TerminatorInst; class ValueSymbolTable; @@ -261,6 +262,50 @@ public: inline const Instruction &back() const { return InstList.back(); } inline Instruction &back() { return InstList.back(); } + /// Iterator to walk just the phi nodes in the basic block. + template + class phi_iterator_impl + : public iterator_facade_base, + std::forward_iterator_tag, PHINodeT> { + friend BasicBlock; + + PHINodeT *PN; + + phi_iterator_impl(PHINodeT *PN) : PN(PN) {} + + public: + // Allow default construction to build variables, but this doesn't build + // a useful iterator. + phi_iterator_impl() = default; + + // Allow conversion between instantiations where valid. + template + phi_iterator_impl(const phi_iterator_impl &Arg) + : PN(Arg.PN) {} + + bool operator==(const phi_iterator_impl &Arg) const { return PN == Arg.PN; } + + PHINodeT &operator*() const { return *PN; } + + using phi_iterator_impl::iterator_facade_base::operator++; + phi_iterator_impl &operator++() { + assert(PN && "Cannot increment the end iterator!"); + PN = dyn_cast(std::next(BBIteratorT(PN))); + return *this; + } + }; + typedef phi_iterator_impl<> phi_iterator; + typedef phi_iterator_impl + const_phi_iterator; + + /// Returns a range that iterates over the phis in the basic block. + /// + /// Note that this cannot be used with basic blocks that have no terminator. + iterator_range phis() const { + return const_cast(this)->phis(); + } + iterator_range phis(); + /// \brief Return the underlying instruction list container. 
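A short sketch (not part of the patch; the helper name is illustrative) of the phis() range added to BasicBlock above. It walks only the leading PHI nodes and, as the doc comment notes, requires the block to have a terminator.

  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Count how many PHIs in BB have an incoming value from Pred.
  static unsigned countPhisFrom(BasicBlock &BB, BasicBlock *Pred) {
    unsigned N = 0;
    for (PHINode &PN : BB.phis())
      if (PN.getBasicBlockIndex(Pred) >= 0)
        ++N;
    return N;
  }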
/// /// Currently you need to access the underlying instruction list container diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h index 05e3315cbab2..2ae98d9e35b0 100644 --- a/include/llvm/IR/IntrinsicInst.h +++ b/include/llvm/IR/IntrinsicInst.h @@ -171,6 +171,7 @@ namespace llvm { ebStrict }; + bool isUnaryOp() const; RoundingMode getRoundingMode() const; ExceptionBehavior getExceptionBehavior() const; @@ -182,6 +183,18 @@ namespace llvm { case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: return true; default: return false; } diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 19f6045568f4..291d16fb0d9b 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -489,8 +489,64 @@ let IntrProperties = [IntrInaccessibleMemOnly] in { LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + + // These intrinsics are sensitive to the rounding mode so we need constrained + // versions of each of them. When strict rounding and exception control are + // not required the non-constrained versions of these intrinsics should be + // used. 
+ def int_experimental_constrained_sqrt : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_powi : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_i32_ty, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_sin : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_cos : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_pow : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_log : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_log10: Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_log2 : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_exp : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_exp2 : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_rint : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; + def int_experimental_constrained_nearbyint : Intrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; } -// FIXME: Add intrinsic for fcmp, fptrunc, fpext, fptoui and fptosi. +// FIXME: Add intrinsics for fcmp, fptrunc, fpext, fptoui and fptosi. +// FIXME: Add intrinsics for fabs, copysign, floor, ceil, trunc and round? //===------------------------- Expect Intrinsics --------------------------===// diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index d7413fe9e56f..e1928546607a 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -566,6 +566,16 @@ def int_amdgcn_s_getreg : [IntrReadMem, IntrSpeculatable] >; +// int_amdgcn_s_getpc is provided to allow a specific style of position +// independent code to determine the high part of its address when it is +// known (through convention) that the code and any data of interest does +// not cross a 4Gb address boundary. Use for any other purpose may not +// produce the desired results as optimizations may cause code movement, +// especially as we explicitly use IntrNoMem to allow optimizations. +def int_amdgcn_s_getpc : + GCCBuiltin<"__builtin_amdgcn_s_getpc">, + Intrinsic<[llvm_i64_ty], [], [IntrNoMem, IntrSpeculatable]>; + // __builtin_amdgcn_interp_mov , , , // param values: 0 = P10, 1 = P20, 2 = P0 def int_amdgcn_interp_mov : diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h index 92f701e01ff3..3c753260190e 100644 --- a/include/llvm/IR/Metadata.h +++ b/include/llvm/IR/Metadata.h @@ -1223,6 +1223,7 @@ public: // FIXME: Fix callers and remove condition on N. unsigned size() const { return N ? N->getNumOperands() : 0u; } + bool empty() const { return N ? N->getNumOperands() == 0 : true; } T *operator[](unsigned I) const { return cast_or_null(N->getOperand(I)); } // FIXME: Fix callers and remove condition on N. 
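A hedged sketch of constructing a call to one of the constrained intrinsics defined above (sqrt, as an example). The metadata strings "round.dynamic" and "fpexcept.strict" follow the convention of the existing constrained arithmetic intrinsics and are an assumption here, as is the helper name.

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Emits: call @llvm.experimental.constrained.sqrt.* (%x,
  //        metadata !"round.dynamic", metadata !"fpexcept.strict")
  static Value *emitConstrainedSqrt(IRBuilder<> &B, Module &M, Value *X) {
    LLVMContext &Ctx = M.getContext();
    Function *Sqrt = Intrinsic::getDeclaration(
        &M, Intrinsic::experimental_constrained_sqrt, {X->getType()});
    Value *Rounding =
        MetadataAsValue::get(Ctx, MDString::get(Ctx, "round.dynamic"));
    Value *Except =
        MetadataAsValue::get(Ctx, MDString::get(Ctx, "fpexcept.strict"));
    return B.CreateCall(Sqrt, {X, Rounding, Except});
  }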
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 3024d9e27a2f..5e1f680c5b36 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -139,9 +139,12 @@ public: /// during the append operation. AppendUnique = 6, + /// Takes the max of the two values, which are required to be integers. + Max = 7, + // Markers: ModFlagBehaviorFirstVal = Error, - ModFlagBehaviorLastVal = AppendUnique + ModFlagBehaviorLastVal = Max }; /// Checks if Metadata represents a valid ModFlagBehavior, and stores the diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 3df5244a0bd6..3ca21c15577b 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -144,6 +144,7 @@ void initializeGCMachineCodeAnalysisPass(PassRegistry&); void initializeGCModuleInfoPass(PassRegistry&); void initializeGCOVProfilerLegacyPassPass(PassRegistry&); void initializeGVNHoistLegacyPassPass(PassRegistry&); +void initializeGVNSinkLegacyPassPass(PassRegistry&); void initializeGVNLegacyPassPass(PassRegistry&); void initializeGlobalDCELegacyPassPass(PassRegistry&); void initializeGlobalMergePass(PassRegistry&); @@ -193,6 +194,7 @@ void initializeLiveVariablesPass(PassRegistry&); void initializeLoadCombinePass(PassRegistry&); void initializeLoadStoreVectorizerPass(PassRegistry&); void initializeLoaderPassPass(PassRegistry&); +void initializeLocalizerPass(PassRegistry&); void initializeLocalStackSlotPassPass(PassRegistry&); void initializeLoopAccessLegacyAnalysisPass(PassRegistry&); void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&); diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h index ede6637dfa4d..5ba8492db8f5 100644 --- a/include/llvm/LTO/Config.h +++ b/include/llvm/LTO/Config.h @@ -39,7 +39,7 @@ struct Config { std::string CPU; TargetOptions Options; std::vector MAttrs; - Reloc::Model RelocModel = Reloc::PIC_; + Optional RelocModel = Reloc::PIC_; CodeModel::Model CodeModel = CodeModel::Default; CodeGenOpt::Level CGOptLevel = CodeGenOpt::Default; TargetMachine::CodeGenFileType CGFileType = TargetMachine::CGFT_ObjectFile; diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h index cf5d93ee9ed7..3f5a233c1ee1 100644 --- a/include/llvm/Object/Binary.h +++ b/include/llvm/Object/Binary.h @@ -95,9 +95,7 @@ public: return TypeID > ID_StartObjects && TypeID < ID_EndObjects; } - bool isSymbolic() const { - return isIR() || isObject(); - } + bool isSymbolic() const { return isIR() || isObject() || isCOFFImportFile(); } bool isArchive() const { return TypeID == ID_Archive; diff --git a/include/llvm/Object/COFF.h b/include/llvm/Object/COFF.h index 8b9b49737170..dafd1a43cb59 100644 --- a/include/llvm/Object/COFF.h +++ b/include/llvm/Object/COFF.h @@ -782,6 +782,7 @@ protected: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h index d8b58b8079fa..ef2abd8c52ce 100644 --- a/include/llvm/Object/ELFObjectFile.h +++ b/include/llvm/Object/ELFObjectFile.h @@ -235,6 +235,7 @@ protected: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t 
getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; @@ -645,6 +646,17 @@ uint64_t ELFObjectFile::getSectionAddress(DataRefImpl Sec) const { return getSection(Sec)->sh_addr; } +template +uint64_t ELFObjectFile::getSectionIndex(DataRefImpl Sec) const { + auto SectionsOrErr = EF.sections(); + handleAllErrors(std::move(SectionsOrErr.takeError()), + [](const ErrorInfoBase &) { + llvm_unreachable("unable to get section index"); + }); + const Elf_Shdr *First = SectionsOrErr->begin(); + return getSection(Sec) - First; +} + template uint64_t ELFObjectFile::getSectionSize(DataRefImpl Sec) const { return getSection(Sec)->sh_size; diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h index 29553558f72f..a4356d5977b2 100644 --- a/include/llvm/Object/MachO.h +++ b/include/llvm/Object/MachO.h @@ -290,6 +290,7 @@ public: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h index 9a7bc618ffd0..ea6a9049bc1b 100644 --- a/include/llvm/Object/ObjectFile.h +++ b/include/llvm/Object/ObjectFile.h @@ -95,6 +95,7 @@ public: std::error_code getName(StringRef &Result) const; uint64_t getAddress() const; + uint64_t getIndex() const; uint64_t getSize() const; std::error_code getContents(StringRef &Result) const; @@ -222,6 +223,7 @@ protected: virtual std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const = 0; virtual uint64_t getSectionAddress(DataRefImpl Sec) const = 0; + virtual uint64_t getSectionIndex(DataRefImpl Sec) const = 0; virtual uint64_t getSectionSize(DataRefImpl Sec) const = 0; virtual std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const = 0; @@ -393,6 +395,10 @@ inline uint64_t SectionRef::getAddress() const { return OwningObject->getSectionAddress(SectionPimpl); } +inline uint64_t SectionRef::getIndex() const { + return OwningObject->getSectionIndex(SectionPimpl); +} + inline uint64_t SectionRef::getSize() const { return OwningObject->getSectionSize(SectionPimpl); } diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h index 86579b7c3e3a..348179860f3e 100644 --- a/include/llvm/Object/RelocVisitor.h +++ b/include/llvm/Object/RelocVisitor.h @@ -40,13 +40,13 @@ public: // TODO: Should handle multiple applied relocations via either passing in the // previously computed value or just count paired relocations as a single // visit. 
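An illustrative sketch (not from the patch; helper name assumed) of the SectionRef::getIndex() accessor added above, which forwards to the new per-format getSectionIndex() implementations.

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;
  using namespace llvm::object;

  static void dumpSectionIndices(const ObjectFile &Obj) {
    for (const SectionRef &Sec : Obj.sections()) {
      StringRef Name;
      Sec.getName(Name); // error intentionally ignored in this sketch
      outs() << Sec.getIndex() << ": " << Name << "\n";
    }
  }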
- uint64_t visit(uint32_t RelocType, RelocationRef R, uint64_t Value = 0) { + uint64_t visit(uint32_t Rel, RelocationRef R, uint64_t Value = 0) { if (isa(ObjToVisit)) - return visitELF(RelocType, R, Value); + return visitELF(Rel, R, Value); if (isa(ObjToVisit)) - return visitCOFF(RelocType, R, Value); + return visitCOFF(Rel, R, Value); if (isa(ObjToVisit)) - return visitMachO(RelocType, R, Value); + return visitMachO(Rel, R, Value); HasError = true; return 0; @@ -58,214 +58,60 @@ private: const ObjectFile &ObjToVisit; bool HasError = false; - uint64_t visitELF(uint32_t RelocType, RelocationRef R, uint64_t Value) { + uint64_t visitELF(uint32_t Rel, RelocationRef R, uint64_t Value) { if (ObjToVisit.getBytesInAddress() == 8) { // 64-bit object file switch (ObjToVisit.getArch()) { case Triple::x86_64: - switch (RelocType) { - case ELF::R_X86_64_NONE: - return visitELF_X86_64_NONE(R); - case ELF::R_X86_64_64: - return visitELF_X86_64_64(R, Value); - case ELF::R_X86_64_PC32: - return visitELF_X86_64_PC32(R, Value); - case ELF::R_X86_64_32: - return visitELF_X86_64_32(R, Value); - case ELF::R_X86_64_32S: - return visitELF_X86_64_32S(R, Value); - default: - HasError = true; - return 0; - } + return visitX86_64(Rel, R, Value); case Triple::aarch64: case Triple::aarch64_be: - switch (RelocType) { - case ELF::R_AARCH64_ABS32: - return visitELF_AARCH64_ABS32(R, Value); - case ELF::R_AARCH64_ABS64: - return visitELF_AARCH64_ABS64(R, Value); - default: - HasError = true; - return 0; - } + return visitAarch64(Rel, R, Value); case Triple::bpfel: case Triple::bpfeb: - switch (RelocType) { - case ELF::R_BPF_64_64: - return visitELF_BPF_64_64(R, Value); - case ELF::R_BPF_64_32: - return visitELF_BPF_64_32(R, Value); - default: - HasError = true; - return 0; - } + return visitBpf(Rel, R, Value); case Triple::mips64el: case Triple::mips64: - switch (RelocType) { - case ELF::R_MIPS_32: - return visitELF_MIPS64_32(R, Value); - case ELF::R_MIPS_64: - return visitELF_MIPS64_64(R, Value); - default: - HasError = true; - return 0; - } + return visitMips64(Rel, R, Value); case Triple::ppc64le: case Triple::ppc64: - switch (RelocType) { - case ELF::R_PPC64_ADDR32: - return visitELF_PPC64_ADDR32(R, Value); - case ELF::R_PPC64_ADDR64: - return visitELF_PPC64_ADDR64(R, Value); - default: - HasError = true; - return 0; - } + return visitPPC64(Rel, R, Value); case Triple::systemz: - switch (RelocType) { - case ELF::R_390_32: - return visitELF_390_32(R, Value); - case ELF::R_390_64: - return visitELF_390_64(R, Value); - default: - HasError = true; - return 0; - } + return visitSystemz(Rel, R, Value); case Triple::sparcv9: - switch (RelocType) { - case ELF::R_SPARC_32: - case ELF::R_SPARC_UA32: - return visitELF_SPARCV9_32(R, Value); - case ELF::R_SPARC_64: - case ELF::R_SPARC_UA64: - return visitELF_SPARCV9_64(R, Value); - default: - HasError = true; - return 0; - } + return visitSparc64(Rel, R, Value); case Triple::amdgcn: - switch (RelocType) { - case ELF::R_AMDGPU_ABS32: - return visitELF_AMDGPU_ABS32(R, Value); - case ELF::R_AMDGPU_ABS64: - return visitELF_AMDGPU_ABS64(R, Value); - default: - HasError = true; - return 0; - } + return visitAmdgpu(Rel, R, Value); default: HasError = true; return 0; } - } else if (ObjToVisit.getBytesInAddress() == 4) { // 32-bit object file - switch (ObjToVisit.getArch()) { - case Triple::x86: - switch (RelocType) { - case ELF::R_386_NONE: - return visitELF_386_NONE(R); - case ELF::R_386_32: - return visitELF_386_32(R, Value); - case ELF::R_386_PC32: - return visitELF_386_PC32(R, Value); - 
default: - HasError = true; - return 0; - } - case Triple::ppc: - switch (RelocType) { - case ELF::R_PPC_ADDR32: - return visitELF_PPC_ADDR32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::arm: - case Triple::armeb: - switch (RelocType) { - default: - HasError = true; - return 0; - case ELF::R_ARM_ABS32: - return visitELF_ARM_ABS32(R, Value); - } - case Triple::lanai: - switch (RelocType) { - case ELF::R_LANAI_32: - return visitELF_Lanai_32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::mipsel: - case Triple::mips: - switch (RelocType) { - case ELF::R_MIPS_32: - return visitELF_MIPS_32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::sparc: - switch (RelocType) { - case ELF::R_SPARC_32: - case ELF::R_SPARC_UA32: - return visitELF_SPARC_32(R, Value); - default: - HasError = true; - return 0; - } - case Triple::hexagon: - switch (RelocType) { - case ELF::R_HEX_32: - return visitELF_HEX_32(R, Value); - default: - HasError = true; - return 0; - } - default: - HasError = true; - return 0; - } - } else { - report_fatal_error("Invalid word size in object file"); } - } - uint64_t visitCOFF(uint32_t RelocType, RelocationRef R, uint64_t Value) { + // 32-bit object file + assert(ObjToVisit.getBytesInAddress() == 4 && + "Invalid word size in object file"); + switch (ObjToVisit.getArch()) { case Triple::x86: - switch (RelocType) { - case COFF::IMAGE_REL_I386_SECREL: - return visitCOFF_I386_SECREL(R, Value); - case COFF::IMAGE_REL_I386_DIR32: - return visitCOFF_I386_DIR32(R, Value); - } - break; - case Triple::x86_64: - switch (RelocType) { - case COFF::IMAGE_REL_AMD64_SECREL: - return visitCOFF_AMD64_SECREL(R, Value); - case COFF::IMAGE_REL_AMD64_ADDR64: - return visitCOFF_AMD64_ADDR64(R, Value); - } - break; + return visitX86(Rel, R, Value); + case Triple::ppc: + return visitPPC32(Rel, R, Value); + case Triple::arm: + case Triple::armeb: + return visitARM(Rel, R, Value); + case Triple::lanai: + return visitLanai(Rel, R, Value); + case Triple::mipsel: + case Triple::mips: + return visitMips32(Rel, R, Value); + case Triple::sparc: + return visitSparc32(Rel, R, Value); + case Triple::hexagon: + return visitHexagon(Rel, R, Value); + default: + HasError = true; + return 0; } - HasError = true; - return 0; - } - - uint64_t visitMachO(uint32_t RelocType, RelocationRef R, uint64_t Value) { - switch (ObjToVisit.getArch()) { - default: break; - case Triple::x86_64: - switch (RelocType) { - default: break; - case MachO::X86_64_RELOC_UNSIGNED: - return visitMACHO_X86_64_UNSIGNED(R, Value); - } - } - HasError = true; - return 0; } int64_t getELFAddend(RelocationRef R) { @@ -275,176 +121,193 @@ private: return *AddendOrErr; } - /// Operations - - /// 386-ELF - uint64_t visitELF_386_NONE(RelocationRef R) { + uint64_t visitX86_64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_X86_64_NONE: + return 0; + case ELF::R_X86_64_64: + return Value + getELFAddend(R); + case ELF::R_X86_64_PC32: + return Value + getELFAddend(R) - R.getOffset(); + case ELF::R_X86_64_32: + case ELF::R_X86_64_32S: + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + } + HasError = true; return 0; } - // Ideally the Addend here will be the addend in the data for - // the relocation. It's not actually the case for Rel relocations. 
- uint64_t visitELF_386_32(RelocationRef R, uint64_t Value) { - return Value; - } - - uint64_t visitELF_386_PC32(RelocationRef R, uint64_t Value) { - return Value - R.getOffset(); - } - - /// X86-64 ELF - uint64_t visitELF_X86_64_NONE(RelocationRef R) { + uint64_t visitAarch64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_AARCH64_ABS32: { + int64_t Res = Value + getELFAddend(R); + if (Res < INT32_MIN || Res > UINT32_MAX) + HasError = true; + return static_cast(Res); + } + case ELF::R_AARCH64_ABS64: + return Value + getELFAddend(R); + } + HasError = true; return 0; } - uint64_t visitELF_X86_64_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); + uint64_t visitBpf(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_BPF_64_32: + return Value & 0xFFFFFFFF; + case ELF::R_BPF_64_64: + return Value; + } + HasError = true; + return 0; } - uint64_t visitELF_X86_64_PC32(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R) - R.getOffset(); + uint64_t visitMips64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_MIPS_32: + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + case ELF::R_MIPS_64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - uint64_t visitELF_X86_64_32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitPPC64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_PPC64_ADDR32: + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + case ELF::R_PPC64_ADDR64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - uint64_t visitELF_X86_64_32S(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitSystemz(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_390_32: { + int64_t Res = Value + getELFAddend(R); + if (Res < INT32_MIN || Res > UINT32_MAX) + HasError = true; + return static_cast(Res); + } + case ELF::R_390_64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - /// BPF ELF - uint64_t visitELF_BPF_64_32(RelocationRef R, uint64_t Value) { - return Value & 0xFFFFFFFF; + uint64_t visitSparc64(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_SPARC_32: + case ELF::R_SPARC_64: + case ELF::R_SPARC_UA32: + case ELF::R_SPARC_UA64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - uint64_t visitELF_BPF_64_64(RelocationRef R, uint64_t Value) { - return Value; + uint64_t visitAmdgpu(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_AMDGPU_ABS32: + case ELF::R_AMDGPU_ABS64: + return Value + getELFAddend(R); + } + HasError = true; + return 0; } - /// PPC64 ELF - uint64_t visitELF_PPC64_ADDR32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitX86(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (Rel) { + case ELF::R_386_NONE: + return 0; + case ELF::R_386_32: + return Value; + case ELF::R_386_PC32: + return Value - R.getOffset(); + } + HasError = true; + return 0; } - uint64_t visitELF_PPC64_ADDR64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); + uint64_t visitPPC32(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_PPC_ADDR32) + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + HasError = true; + return 0; } - /// PPC32 ELF - uint64_t visitELF_PPC_ADDR32(RelocationRef R, uint64_t Value) { - 
return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitARM(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_ARM_ABS32) { + if ((int64_t)Value < INT32_MIN || (int64_t)Value > UINT32_MAX) + HasError = true; + return static_cast(Value); + } + HasError = true; + return 0; } - /// Lanai ELF - uint64_t visitELF_Lanai_32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitLanai(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_LANAI_32) + return (Value + getELFAddend(R)) & 0xFFFFFFFF; + HasError = true; + return 0; } - /// MIPS ELF - uint64_t visitELF_MIPS_32(RelocationRef R, uint64_t Value) { - return Value & 0xFFFFFFFF; + uint64_t visitMips32(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_MIPS_32) + return Value & 0xFFFFFFFF; + HasError = true; + return 0; } - /// MIPS64 ELF - uint64_t visitELF_MIPS64_32(RelocationRef R, uint64_t Value) { - return (Value + getELFAddend(R)) & 0xFFFFFFFF; + uint64_t visitSparc32(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_SPARC_32 || Rel == ELF::R_SPARC_UA32) + return Value + getELFAddend(R); + HasError = true; + return 0; } - uint64_t visitELF_MIPS64_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); + uint64_t visitHexagon(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (Rel == ELF::R_HEX_32) + return Value + getELFAddend(R); + HasError = true; + return 0; } - // AArch64 ELF - uint64_t visitELF_AARCH64_ABS32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - int64_t Res = Value + Addend; - - // Overflow check allows for both signed and unsigned interpretation. - if (Res < INT32_MIN || Res > UINT32_MAX) - HasError = true; - - return static_cast(Res); + uint64_t visitCOFF(uint32_t Rel, RelocationRef R, uint64_t Value) { + switch (ObjToVisit.getArch()) { + case Triple::x86: + switch (Rel) { + case COFF::IMAGE_REL_I386_SECREL: + case COFF::IMAGE_REL_I386_DIR32: + return static_cast(Value); + } + break; + case Triple::x86_64: + switch (Rel) { + case COFF::IMAGE_REL_AMD64_SECREL: + return static_cast(Value); + case COFF::IMAGE_REL_AMD64_ADDR64: + return Value; + } + break; + } + HasError = true; + return 0; } - uint64_t visitELF_AARCH64_ABS64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - // SystemZ ELF - uint64_t visitELF_390_32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - int64_t Res = Value + Addend; - - // Overflow check allows for both signed and unsigned interpretation. - if (Res < INT32_MIN || Res > UINT32_MAX) - HasError = true; - - return static_cast(Res); - } - - uint64_t visitELF_390_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_SPARC_32(RelocationRef R, uint32_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_SPARCV9_32(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_SPARCV9_64(RelocationRef R, uint64_t Value) { - return Value + getELFAddend(R); - } - - uint64_t visitELF_ARM_ABS32(RelocationRef R, uint64_t Value) { - int64_t Res = Value; - - // Overflow check allows for both signed and unsigned interpretation. 
- if (Res < INT32_MIN || Res > UINT32_MAX) - HasError = true; - - return static_cast(Res); - } - - uint64_t visitELF_HEX_32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - return Value + Addend; - } - - uint64_t visitELF_AMDGPU_ABS32(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - return Value + Addend; - } - - uint64_t visitELF_AMDGPU_ABS64(RelocationRef R, uint64_t Value) { - int64_t Addend = getELFAddend(R); - return Value + Addend; - } - - /// I386 COFF - uint64_t visitCOFF_I386_SECREL(RelocationRef R, uint64_t Value) { - return static_cast(Value); - } - - uint64_t visitCOFF_I386_DIR32(RelocationRef R, uint64_t Value) { - return static_cast(Value); - } - - /// AMD64 COFF - uint64_t visitCOFF_AMD64_SECREL(RelocationRef R, uint64_t Value) { - return static_cast(Value); - } - - uint64_t visitCOFF_AMD64_ADDR64(RelocationRef R, uint64_t Value) { - return Value; - } - - // X86_64 MachO - uint64_t visitMACHO_X86_64_UNSIGNED(RelocationRef R, uint64_t Value) { - return Value; + uint64_t visitMachO(uint32_t Rel, RelocationRef R, uint64_t Value) { + if (ObjToVisit.getArch() == Triple::x86_64 && + Rel == MachO::X86_64_RELOC_UNSIGNED) + return Value; + HasError = true; + return 0; } }; diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index d200d4a148e3..de54a4928cce 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -119,6 +119,7 @@ public: std::error_code getSectionName(DataRefImpl Sec, StringRef &Res) const override; uint64_t getSectionAddress(DataRefImpl Sec) const override; + uint64_t getSectionIndex(DataRefImpl Sec) const override; uint64_t getSectionSize(DataRefImpl Sec) const override; std::error_code getSectionContents(DataRefImpl Sec, StringRef &Res) const override; diff --git a/include/llvm/Option/OptTable.h b/include/llvm/Option/OptTable.h index 390e52774fea..8a323a255ca1 100644 --- a/include/llvm/Option/OptTable.h +++ b/include/llvm/Option/OptTable.h @@ -113,6 +113,14 @@ public: return getInfo(id).MetaVar; } + /// Find flags from OptTable which starts with Cur. + /// + /// \param [in] Cur - String prefix that all returned flags need + // to start with. + /// + /// \return The vector of flags which start with Cur. + std::vector findByPrefix(StringRef Cur) const; + /// \brief Parse a single argument; returning the new argument and /// updating Index. /// diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h index 1b07c33746e7..0dbb2cf9f269 100644 --- a/include/llvm/ProfileData/InstrProf.h +++ b/include/llvm/ProfileData/InstrProf.h @@ -212,12 +212,12 @@ StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, /// third field is the uncompressed strings; otherwise it is the /// compressed string. When the string compression is off, the /// second field will have value zero. -Error collectPGOFuncNameStrings(const std::vector &NameStrs, +Error collectPGOFuncNameStrings(ArrayRef NameStrs, bool doCompression, std::string &Result); /// Produce \c Result string with the same format described above. The input /// is vector of PGO function name variables that are referenced. -Error collectPGOFuncNameStrings(const std::vector &NameVars, +Error collectPGOFuncNameStrings(ArrayRef NameVars, std::string &Result, bool doCompression = true); /// \c NameStrings is a string composed of one of more sub-strings encoded in @@ -967,7 +967,7 @@ struct Header { } // end namespace RawInstrProf // Parse MemOP Size range option. 
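The RelocVisitor rewrite above folds the per-relocation-type helpers into per-architecture visit functions; the public entry point keeps its shape. A hedged usage sketch, assuming the pre-existing RelocVisitor constructor and error() accessor, which this patch does not touch:

  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Object/RelocVisitor.h"
  using namespace llvm;
  using namespace llvm::object;

  static uint64_t applyReloc(const ObjectFile &Obj, const RelocationRef &R,
                             uint64_t SymbolValue) {
    RelocVisitor V(Obj);
    uint64_t Result =
        V.visit(static_cast<uint32_t>(R.getType()), R, SymbolValue);
    return V.error() ? 0 : Result; // error() assumed from the existing class
  }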
-void getMemOPSizeRangeFromOption(std::string Str, int64_t &RangeStart, +void getMemOPSizeRangeFromOption(StringRef Str, int64_t &RangeStart, int64_t &RangeLast); } // end namespace llvm diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h index fef5bf304566..d14a56cb87e0 100644 --- a/include/llvm/TableGen/Record.h +++ b/include/llvm/TableGen/Record.h @@ -671,7 +671,7 @@ public: /// [AL, AH, CL] - Represent a list of defs /// class ListInit final : public TypedInit, public FoldingSetNode, - public TrailingObjects { + public TrailingObjects { unsigned NumValues; public: @@ -1137,17 +1137,19 @@ public: /// to have at least one value then a (possibly empty) list of arguments. Each /// argument can have a name associated with it. /// -class DagInit : public TypedInit, public FoldingSetNode { +class DagInit final : public TypedInit, public FoldingSetNode, + public TrailingObjects { Init *Val; StringInit *ValName; - SmallVector Args; - SmallVector ArgNames; + unsigned NumArgs; + unsigned NumArgNames; - DagInit(Init *V, StringInit *VN, ArrayRef ArgRange, - ArrayRef NameRange) + DagInit(Init *V, StringInit *VN, unsigned NumArgs, unsigned NumArgNames) : TypedInit(IK_DagInit, DagRecTy::get()), Val(V), ValName(VN), - Args(ArgRange.begin(), ArgRange.end()), - ArgNames(NameRange.begin(), NameRange.end()) {} + NumArgs(NumArgs), NumArgNames(NumArgNames) {} + + friend TrailingObjects; + size_t numTrailingObjects(OverloadToken) const { return NumArgs; } public: DagInit(const DagInit &Other) = delete; @@ -1173,20 +1175,24 @@ public: return ValName ? ValName->getValue() : StringRef(); } - unsigned getNumArgs() const { return Args.size(); } + unsigned getNumArgs() const { return NumArgs; } Init *getArg(unsigned Num) const { - assert(Num < Args.size() && "Arg number out of range!"); - return Args[Num]; + assert(Num < NumArgs && "Arg number out of range!"); + return getTrailingObjects()[Num]; } StringInit *getArgName(unsigned Num) const { - assert(Num < ArgNames.size() && "Arg number out of range!"); - return ArgNames[Num]; + assert(Num < NumArgNames && "Arg number out of range!"); + return getTrailingObjects()[Num]; } StringRef getArgNameStr(unsigned Num) const { StringInit *Init = getArgName(Num); return Init ? 
Init->getValue() : StringRef(); } + ArrayRef getArgNames() const { + return makeArrayRef(getTrailingObjects(), NumArgNames); + } + Init *resolveReferences(Record &R, const RecordVal *RV) const override; std::string getAsString() const override; @@ -1194,20 +1200,20 @@ public: typedef SmallVectorImpl::const_iterator const_arg_iterator; typedef SmallVectorImpl::const_iterator const_name_iterator; - inline const_arg_iterator arg_begin() const { return Args.begin(); } - inline const_arg_iterator arg_end () const { return Args.end(); } + inline const_arg_iterator arg_begin() const { return getTrailingObjects(); } + inline const_arg_iterator arg_end () const { return arg_begin() + NumArgs; } inline iterator_range args() const { return llvm::make_range(arg_begin(), arg_end()); } - inline size_t arg_size () const { return Args.size(); } - inline bool arg_empty() const { return Args.empty(); } + inline size_t arg_size () const { return NumArgs; } + inline bool arg_empty() const { return NumArgs == 0; } - inline const_name_iterator name_begin() const { return ArgNames.begin(); } - inline const_name_iterator name_end () const { return ArgNames.end(); } + inline const_name_iterator name_begin() const { return getTrailingObjects(); } + inline const_name_iterator name_end () const { return name_begin() + NumArgNames; } - inline size_t name_size () const { return ArgNames.size(); } - inline bool name_empty() const { return ArgNames.empty(); } + inline size_t name_size () const { return NumArgNames; } + inline bool name_empty() const { return NumArgNames == 0; } Init *getBit(unsigned Bit) const override { llvm_unreachable("Illegal bit reference off dag"); diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 1ca32d4c3589..17182b958ecb 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -405,7 +405,9 @@ public: } /// Returns if it's reasonable to merge stores to MemVT size. - virtual bool canMergeStoresTo(EVT MemVT) const { return true; } + virtual bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const { + return true; + } /// \brief Return true if it is cheap to speculate a call to intrinsic cttz. virtual bool isCheapToSpeculateCttz() const { @@ -736,7 +738,7 @@ public: if (VT.isExtended()) return Expand; // If a target-specific SDNode requires legalization, require the target // to provide custom legalization for it. - if (Op > array_lengthof(OpActions[0])) return Custom; + if (Op >= array_lengthof(OpActions[0])) return Custom; return OpActions[(unsigned)VT.getSimpleVT().SimpleTy][Op]; } diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index ba0a3ee1287a..856c288a071f 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -354,6 +354,13 @@ FunctionPass *createEarlyCSEPass(bool UseMemorySSA = false); // FunctionPass *createGVNHoistPass(); +//===----------------------------------------------------------------------===// +// +// GVNSink - This pass uses an "inverted" value numbering to decide the +// similarity of expressions and sinks similar expressions into successors. +// +FunctionPass *createGVNSinkPass(); + //===----------------------------------------------------------------------===// // // MergedLoadStoreMotion - This pass merges loads and stores in diamonds. 
Loads diff --git a/include/llvm/Transforms/Scalar/GVN.h b/include/llvm/Transforms/Scalar/GVN.h index 8f05e8cdb233..3f97789cabbc 100644 --- a/include/llvm/Transforms/Scalar/GVN.h +++ b/include/llvm/Transforms/Scalar/GVN.h @@ -68,6 +68,24 @@ public: class ValueTable { DenseMap valueNumbering; DenseMap expressionNumbering; + + // Expressions is the vector of Expression. ExprIdx is the mapping from + // value number to the index of Expression in Expressions. We use it + // instead of a DenseMap because filling such mapping is faster than + // filling a DenseMap and the compile time is a little better. + uint32_t nextExprNumber; + std::vector Expressions; + std::vector ExprIdx; + // Value number to PHINode mapping. Used for phi-translate in scalarpre. + DenseMap NumberingPhi; + // Cache for phi-translate in scalarpre. + typedef DenseMap, uint32_t> + PhiTranslateMap; + PhiTranslateMap PhiTranslateTable; + // Map the block to reversed postorder traversal number. It is used to + // find back edge easily. + DenseMap BlockRPONumber; + AliasAnalysis *AA; MemoryDependenceResults *MD; DominatorTree *DT; @@ -79,6 +97,10 @@ public: Value *LHS, Value *RHS); Expression createExtractvalueExpr(ExtractValueInst *EI); uint32_t lookupOrAddCall(CallInst *C); + uint32_t phiTranslateImpl(const BasicBlock *BB, const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn); + std::pair assignExpNewValueNum(Expression &exp); + bool areAllValsInBB(uint32_t num, const BasicBlock *BB, GVN &Gvn); public: ValueTable(); @@ -87,9 +109,12 @@ public: ~ValueTable(); uint32_t lookupOrAdd(Value *V); - uint32_t lookup(Value *V) const; + uint32_t lookup(Value *V, bool Verify = true) const; uint32_t lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Pred, Value *LHS, Value *RHS); + uint32_t phiTranslate(const BasicBlock *BB, const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn); + void assignBlockRPONumber(Function &F); bool exists(Value *V) const; void add(Value *V, uint32_t num); void clear(); @@ -238,7 +263,12 @@ struct GVNHoistPass : PassInfoMixin { /// \brief Run the pass over the function. PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; - +/// \brief Uses an "inverted" value numbering to decide the similarity of +/// expressions and sinks similar expressions into successors. +struct GVNSinkPass : PassInfoMixin { + /// \brief Run the pass over the function. + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; } #endif diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index b5a5f4c2704c..8942111307ff 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -356,6 +356,10 @@ void combineMetadata(Instruction *K, const Instruction *J, ArrayRef Kn /// Unknown metadata is removed. void combineMetadataForCSE(Instruction *K, const Instruction *J); +// Replace each use of 'From' with 'To', if that use does not belong to basic +// block where 'From' is defined. Returns the number of replacements made. +unsigned replaceNonLocalUsesWith(Instruction *From, Value *To); + /// Replace each use of 'From' with 'To' if that use is dominated by /// the given edge. Returns the number of replacements made. 
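A minimal sketch of scheduling the new GVNSink pass declared above, under both pass managers (the wrapper function is illustrative):

  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Scalar/GVN.h"
  using namespace llvm;

  static void addSinkPass(FunctionPassManager &FPM) {
    FPM.addPass(GVNSinkPass()); // new-PM wrapper declared in GVN.h above
  }
  // Legacy pass manager equivalent, using the factory from Scalar.h:
  //   PM.add(createGVNSinkPass());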
unsigned replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, @@ -406,6 +410,14 @@ bool recognizeBSwapOrBitReverseIdiom( void maybeMarkSanitizerLibraryCallNoBuiltin(CallInst *CI, const TargetLibraryInfo *TLI); +//===----------------------------------------------------------------------===// +// Transform predicates +// + +/// Given an instruction, is it legal to set operand OpIdx to a non-constant +/// value? +bool canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx); + } // End llvm namespace #endif diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 0ca712bbfe70..79517ec6a3a8 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -687,11 +687,8 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1, // bits. if (Opc == Instruction::And) { - unsigned BitWidth = DL.getTypeSizeInBits(Op0->getType()->getScalarType()); - KnownBits Known0(BitWidth); - KnownBits Known1(BitWidth); - computeKnownBits(Op0, Known0, DL); - computeKnownBits(Op1, Known1, DL); + KnownBits Known0 = computeKnownBits(Op0, DL); + KnownBits Known1 = computeKnownBits(Op1, DL); if ((Known1.One | Known0.Zero).isAllOnesValue()) { // All the bits of Op0 that the 'and' could be masking are already zero. return Op0; diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 2e72d5aa8269..122442bafb11 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -688,9 +688,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, if (isNUW) return Op0; - unsigned BitWidth = Op1->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(Op1, Known, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); if (Known.Zero.isMaxSignedValue()) { // Op1 is either 0 or the minimum signed value. If the sub is NSW, then // Op1 must be 0 because negating the minimum signed value is undefined. @@ -1309,15 +1307,13 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, // If any bits in the shift amount make that value greater than or equal to // the number of bits in the type, the shift is undefined. - unsigned BitWidth = Op1->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(Op1, Known, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); - if (Known.One.getLimitedValue() >= BitWidth) + KnownBits Known = computeKnownBits(Op1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (Known.One.getLimitedValue() >= Known.getBitWidth()) return UndefValue::get(Op0->getType()); // If all valid bits in the shift amount are known zero, the first operand is // unchanged. - unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth); + unsigned NumValidShiftBits = Log2_32_Ceil(Known.getBitWidth()); if (Known.countMinTrailingZeros() >= NumValidShiftBits) return Op0; @@ -1343,9 +1339,7 @@ static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0, // The low bit cannot be shifted out of an exact shift if it is set. 
if (isExact) { - unsigned BitWidth = Op0->getType()->getScalarSizeInBits(); - KnownBits Op0Known(BitWidth); - computeKnownBits(Op0, Op0Known, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + KnownBits Op0Known = computeKnownBits(Op0, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (Op0Known.One[0]) return Op0; } @@ -1428,6 +1422,8 @@ Value *llvm::SimplifyAShrInst(Value *Op0, Value *Op1, bool isExact, return ::SimplifyAShrInst(Op0, Op1, isExact, Q, RecursionLimit); } +/// Commuted variants are assumed to be handled by calling this function again +/// with the parameters swapped. static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp, ICmpInst *UnsignedICmp, bool IsAnd) { Value *X, *Y; @@ -1560,20 +1556,8 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, return nullptr; } -/// Commuted variants are assumed to be handled by calling this function again -/// with the parameters swapped. -static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { - if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) - return X; - - if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) - return X; - - if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) - return X; - +static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) & (icmp V, C0) - Type *ITy = Op0->getType(); ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; Value *V; @@ -1587,6 +1571,7 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { if (AddInst->getOperand(1) != Op1->getOperand(1)) return nullptr; + Type *ITy = Op0->getType(); bool isNSW = AddInst->hasNoSignedWrap(); bool isNUW = AddInst->hasNoUnsignedWrap(); @@ -1617,18 +1602,29 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -/// Commuted variants are assumed to be handled by calling this function again -/// with the parameters swapped. -static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { - if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) +static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) { + if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true)) + return X; + if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/true)) return X; - if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) + if (Value *X = simplifyAndOfICmpsWithSameOperands(Op0, Op1)) + return X; + if (Value *X = simplifyAndOfICmpsWithSameOperands(Op1, Op0)) return X; - if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false)) + if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true)) return X; + if (Value *X = simplifyAndOfICmpsWithAdd(Op0, Op1)) + return X; + if (Value *X = simplifyAndOfICmpsWithAdd(Op1, Op0)) + return X; + + return nullptr; +} + +static Value *simplifyOrOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1) { // (icmp (add V, C0), C1) | (icmp V, C0) ICmpInst::Predicate Pred0, Pred1; const APInt *C0, *C1; @@ -1674,19 +1670,24 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { return nullptr; } -static Value *simplifyPossiblyCastedAndOrOfICmps(ICmpInst *Cmp0, ICmpInst *Cmp1, - bool IsAnd, CastInst *Cast) { - Value *V = - IsAnd ? 
simplifyAndOfICmps(Cmp0, Cmp1) : simplifyOrOfICmps(Cmp0, Cmp1); - if (!V) - return nullptr; - if (!Cast) - return V; +static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) { + if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/false)) + return X; + if (Value *X = simplifyUnsignedRangeCheck(Op1, Op0, /*IsAnd=*/false)) + return X; - // If we looked through casts, we can only handle a constant simplification - // because we are not allowed to create a cast instruction here. - if (auto *C = dyn_cast(V)) - return ConstantExpr::getCast(Cast->getOpcode(), C, Cast->getType()); + if (Value *X = simplifyOrOfICmpsWithSameOperands(Op0, Op1)) + return X; + if (Value *X = simplifyOrOfICmpsWithSameOperands(Op1, Op0)) + return X; + + if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false)) + return X; + + if (Value *X = simplifyOrOfICmpsWithAdd(Op0, Op1)) + return X; + if (Value *X = simplifyOrOfICmpsWithAdd(Op1, Op0)) + return X; return nullptr; } @@ -1706,11 +1707,18 @@ static Value *simplifyAndOrOfICmps(Value *Op0, Value *Op1, bool IsAnd) { if (!Cmp0 || !Cmp1) return nullptr; - if (Value *V = simplifyPossiblyCastedAndOrOfICmps(Cmp0, Cmp1, IsAnd, Cast0)) - return V; - if (Value *V = simplifyPossiblyCastedAndOrOfICmps(Cmp1, Cmp0, IsAnd, Cast0)) + Value *V = + IsAnd ? simplifyAndOfICmps(Cmp0, Cmp1) : simplifyOrOfICmps(Cmp0, Cmp1); + if (!V) + return nullptr; + if (!Cast0) return V; + // If we looked through casts, we can only handle a constant simplification + // because we are not allowed to create a cast instruction here. + if (auto *C = dyn_cast(V)) + return ConstantExpr::getCast(Cast0->getOpcode(), C, Cast0->getType()); + return nullptr; } @@ -1927,37 +1935,27 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, MaxRecurse)) return V; - // (A & C)|(B & D) - Value *C = nullptr, *D = nullptr; - if (match(Op0, m_And(m_Value(A), m_Value(C))) && - match(Op1, m_And(m_Value(B), m_Value(D)))) { - ConstantInt *C1 = dyn_cast(C); - ConstantInt *C2 = dyn_cast(D); - if (C1 && C2 && (C1->getValue() == ~C2->getValue())) { + // (A & C1)|(B & C2) + const APInt *C1, *C2; + if (match(Op0, m_And(m_Value(A), m_APInt(C1))) && + match(Op1, m_And(m_Value(B), m_APInt(C2)))) { + if (*C1 == ~*C2) { // (A & C1)|(B & C2) // If we have: ((V + N) & C1) | (V & C2) // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0 // replace with V+N. - Value *V1, *V2; - if ((C2->getValue() & (C2->getValue() + 1)) == 0 && // C2 == 0+1+ - match(A, m_Add(m_Value(V1), m_Value(V2)))) { + Value *N; + if (C2->isMask() && // C2 == 0+1+ + match(A, m_c_Add(m_Specific(B), m_Value(N)))) { // Add commutes, try both ways. - if (V1 == B && - MaskedValueIsZero(V2, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) - return A; - if (V2 == B && - MaskedValueIsZero(V1, C2->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (MaskedValueIsZero(N, *C2, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return A; } // Or commutes, try both ways. - if ((C1->getValue() & (C1->getValue() + 1)) == 0 && - match(B, m_Add(m_Value(V1), m_Value(V2)))) { + if (C1->isMask() && + match(B, m_c_Add(m_Specific(A), m_Value(N)))) { // Add commutes, try both ways. 
- if (V1 == A && - MaskedValueIsZero(V2, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) - return B; - if (V2 == A && - MaskedValueIsZero(V1, C1->getValue(), Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (MaskedValueIsZero(N, *C1, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return B; } } @@ -3372,9 +3370,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, if (ICmpInst::isEquality(Pred)) { const APInt *RHSVal; if (match(RHS, m_APInt(RHSVal))) { - unsigned BitWidth = RHSVal->getBitWidth(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT); if (LHSKnown.Zero.intersects(*RHSVal) || !LHSKnown.One.isSubsetOf(*RHSVal)) return Pred == ICmpInst::ICMP_EQ ? ConstantInt::getFalse(ITy) @@ -3539,6 +3535,10 @@ static const Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, if (V == Op) return RepOp; + // We cannot replace a constant, and shouldn't even try. + if (isa(Op)) + return nullptr; + auto *I = dyn_cast(V); if (!I) return nullptr; @@ -4444,19 +4444,21 @@ static Value *SimplifyIntrinsic(Function *F, IterTy ArgBegin, IterTy ArgEnd, case Intrinsic::uadd_with_overflow: case Intrinsic::sadd_with_overflow: { // X + undef -> undef - if (isa(RHS)) + if (isa(LHS) || isa(RHS)) return UndefValue::get(ReturnType); return nullptr; } case Intrinsic::umul_with_overflow: case Intrinsic::smul_with_overflow: { + // 0 * X -> { 0, false } // X * 0 -> { 0, false } - if (match(RHS, m_Zero())) + if (match(LHS, m_Zero()) || match(RHS, m_Zero())) return Constant::getNullValue(ReturnType); + // undef * X -> { 0, false } // X * undef -> { 0, false } - if (match(RHS, m_Undef())) + if (match(LHS, m_Undef()) || match(RHS, m_Undef())) return Constant::getNullValue(ReturnType); return nullptr; @@ -4680,9 +4682,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ, // In general, it is possible for computeKnownBits to determine all bits in a // value even when the operands are not all constants. 
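Several hunks above and below switch from the two-step pattern (pre-size a KnownBits, then populate it by reference) to the value-returning computeKnownBits overload. A minimal sketch of the new call style (helper name is illustrative):

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  static bool hasKnownOneBit(const Value *V, const DataLayout &DL) {
    // The bit width is taken from V's type; no pre-sized KnownBits needed.
    KnownBits Known = computeKnownBits(V, DL);
    return Known.One.getBoolValue();
  }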
if (!Result && I->getType()->isIntOrIntVectorTy()) { - unsigned BitWidth = I->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(I, Known, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); + KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE); if (Known.isConstant()) Result = ConstantInt::get(I->getType(), Known.getConstant()); } diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index 471ccb62970d..e6391792bc23 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -534,9 +534,7 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, VectorType *VecTy = dyn_cast(V->getType()); if (!VecTy) { - unsigned BitWidth = V->getType()->getIntegerBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(V, Known, DL, 0, AC, dyn_cast(V), DT); + KnownBits Known = computeKnownBits(V, DL, 0, AC, dyn_cast(V), DT); return Known.isZero(); } @@ -550,14 +548,12 @@ static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, // For a vector, KnownZero will only be true if all values are zero, so check // this per component - unsigned BitWidth = VecTy->getElementType()->getIntegerBitWidth(); for (unsigned I = 0, N = VecTy->getNumElements(); I != N; ++I) { Constant *Elem = C->getAggregateElement(I); if (isa(Elem)) return true; - KnownBits Known(BitWidth); - computeKnownBits(Elem, Known, DL); + KnownBits Known = computeKnownBits(Elem, DL); if (Known.isZero()) return true; } diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp index 0b5f6266e373..e988f6444a58 100644 --- a/lib/Analysis/LoopPass.cpp +++ b/lib/Analysis/LoopPass.cpp @@ -73,30 +73,23 @@ LPPassManager::LPPassManager() CurrentLoop = nullptr; } -// Inset loop into loop nest (LoopInfo) and loop queue (LQ). -Loop &LPPassManager::addLoop(Loop *ParentLoop) { - // Create a new loop. LI will take ownership. - Loop *L = new Loop(); - - // Insert into the loop nest and the loop queue. - if (!ParentLoop) { +// Insert loop into loop nest (LoopInfo) and loop queue (LQ). +void LPPassManager::addLoop(Loop &L) { + if (!L.getParentLoop()) { // This is the top level loop. - LI->addTopLevelLoop(L); - LQ.push_front(L); - return *L; + LQ.push_front(&L); + return; } - ParentLoop->addChildLoop(L); // Insert L into the loop queue after the parent loop. for (auto I = LQ.begin(), E = LQ.end(); I != E; ++I) { - if (*I == L->getParentLoop()) { + if (*I == L.getParentLoop()) { // deque does not support insert after. ++I; - LQ.insert(I, 1, L); - break; + LQ.insert(I, 1, &L); + return; } } - return *L; } /// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 78ded8141c08..d280fda0a162 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -2178,6 +2178,63 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type, return Flags; } +bool ScalarEvolution::isAvailableAtLoopEntry(const SCEV *S, const Loop *L, + DominatorTree &DT, LoopInfo &LI) { + if (!isLoopInvariant(S, L)) + return false; + // If a value depends on a SCEVUnknown which is defined after the loop, we + // conservatively assume that we cannot calculate it at the loop's entry. 
+ struct FindDominatedSCEVUnknown { + bool Found = false; + const Loop *L; + DominatorTree &DT; + LoopInfo &LI; + + FindDominatedSCEVUnknown(const Loop *L, DominatorTree &DT, LoopInfo &LI) + : L(L), DT(DT), LI(LI) {} + + bool checkSCEVUnknown(const SCEVUnknown *SU) { + if (auto *I = dyn_cast(SU->getValue())) { + if (DT.dominates(L->getHeader(), I->getParent())) + Found = true; + else + assert(DT.dominates(I->getParent(), L->getHeader()) && + "No dominance relationship between SCEV and loop?"); + } + return false; + } + + bool follow(const SCEV *S) { + switch (static_cast(S->getSCEVType())) { + case scConstant: + return false; + case scAddRecExpr: + case scTruncate: + case scZeroExtend: + case scSignExtend: + case scAddExpr: + case scMulExpr: + case scUMaxExpr: + case scSMaxExpr: + case scUDivExpr: + return true; + case scUnknown: + return checkSCEVUnknown(cast(S)); + case scCouldNotCompute: + llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!"); + } + return false; + } + + bool isDone() { return Found; } + }; + + FindDominatedSCEVUnknown FSU(L, DT, LI); + SCEVTraversal ST(FSU); + ST.visitAll(S); + return !FSU.Found; +} + /// Get a canonical add expression, or something simpler if possible. const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, SCEV::NoWrapFlags Flags, @@ -2459,7 +2516,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (isLoopInvariant(Ops[i], AddRecLoop)) { + if (isAvailableAtLoopEntry(Ops[i], AddRecLoop, DT, LI)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; @@ -2734,7 +2791,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, const SCEVAddRecExpr *AddRec = cast(Ops[Idx]); const Loop *AddRecLoop = AddRec->getLoop(); for (unsigned i = 0, e = Ops.size(); i != e; ++i) - if (isLoopInvariant(Ops[i], AddRecLoop)) { + if (isAvailableAtLoopEntry(Ops[i], AddRecLoop, DT, LI)) { LIOps.push_back(Ops[i]); Ops.erase(Ops.begin()+i); --i; --e; @@ -4648,10 +4705,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) { if (const SCEVUnknown *U = dyn_cast(S)) { // For a SCEVUnknown, ask ValueTracking. - unsigned BitWidth = getTypeSizeInBits(U->getType()); - KnownBits Known(BitWidth); - computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC, - nullptr, &DT); + KnownBits Known = computeKnownBits(U->getValue(), getDataLayout(), 0, &AC, nullptr, &DT); return Known.countMinTrailingZeros(); } @@ -4831,8 +4885,7 @@ ScalarEvolution::getRange(const SCEV *S, const DataLayout &DL = getDataLayout(); if (SignHint == ScalarEvolution::HINT_RANGE_UNSIGNED) { // For a SCEVUnknown, ask ValueTracking. 
- KnownBits Known(BitWidth); - computeKnownBits(U->getValue(), Known, DL, 0, &AC, nullptr, &DT); + KnownBits Known = computeKnownBits(U->getValue(), DL, 0, &AC, nullptr, &DT); if (Known.One != ~Known.Zero + 1) ConservativeResult = ConservativeResult.intersectWith(ConstantRange(Known.One, @@ -9537,8 +9590,11 @@ struct SCEVCollectAddRecMultiplies { bool HasAddRec = false; SmallVector Operands; for (auto Op : Mul->operands()) { - if (isa(Op)) { + const SCEVUnknown *Unknown = dyn_cast(Op); + if (Unknown && !isa(Unknown->getValue())) { Operands.push_back(Op); + } else if (Unknown) { + HasAddRec = true; } else { bool ContainsAddRec; SCEVHasAddRec ContiansAddRec(ContainsAddRec); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 86cbd79aa84e..f9b9df2bc707 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -1305,12 +1305,17 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // Expand the core addrec. If we need post-loop scaling, force it to // expand to an integer type to avoid the need for additional casting. Type *ExpandTy = PostLoopScale ? IntTy : STy; + // We can't use a pointer type for the addrec if the pointer type is + // non-integral. + Type *AddRecPHIExpandTy = + DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy; + // In some cases, we decide to reuse an existing phi node but need to truncate // it and/or invert the step. Type *TruncTy = nullptr; bool InvertStep = false; - PHINode *PN = getAddRecExprPHILiterally(Normalized, L, ExpandTy, IntTy, - TruncTy, InvertStep); + PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy, + IntTy, TruncTy, InvertStep); // Accommodate post-inc mode, if necessary. Value *Result; @@ -1383,8 +1388,15 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) { // Re-apply any non-loop-dominating offset. 
if (PostLoopOffset) { if (PointerType *PTy = dyn_cast(ExpandTy)) { - const SCEV *const OffsetArray[1] = { PostLoopOffset }; - Result = expandAddToGEP(OffsetArray, OffsetArray+1, PTy, IntTy, Result); + if (Result->getType()->isIntegerTy()) { + Value *Base = expandCodeFor(PostLoopOffset, ExpandTy); + const SCEV *const OffsetArray[1] = {SE.getUnknown(Result)}; + Result = expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Base); + } else { + const SCEV *const OffsetArray[1] = {PostLoopOffset}; + Result = + expandAddToGEP(OffsetArray, OffsetArray + 1, PTy, IntTy, Result); + } } else { Result = InsertNoopCastOfTo(Result, IntTy); Result = Builder.CreateAdd(Result, diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 8a5d10473662..7a8d4f3be24f 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -149,6 +149,10 @@ bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType) const { return TTIImpl->isLegalMaskedGather(DataType); } +bool TargetTransformInfo::prefersVectorizedAddressing() const { + return TTIImpl->prefersVectorizedAddressing(); +} + int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 8e6c1096eec8..bd79cd56a18b 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -149,8 +149,10 @@ static KnownBits computeKnownBits(const Value *V, unsigned Depth, KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, - const DominatorTree *DT) { - return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); + const DominatorTree *DT, + OptimizationRemarkEmitter *ORE) { + return ::computeKnownBits(V, Depth, + Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1f8b50342c2d..c1d81ac203a1 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -660,10 +660,12 @@ void ModuleBitcodeWriter::writeAttributeTable() { SmallVector Record; for (unsigned i = 0, e = Attrs.size(); i != e; ++i) { - const AttributeList &A = Attrs[i]; - for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i) - Record.push_back( - VE.getAttributeGroupID({A.getSlotIndex(i), A.getSlotAttributes(i)})); + AttributeList AL = Attrs[i]; + for (unsigned i = AL.index_begin(), e = AL.index_end(); i != e; ++i) { + AttributeSet AS = AL.getAttributes(i); + if (AS.hasAttributes()) + Record.push_back(VE.getAttributeGroupID({i, AS})); + } Stream.EmitRecord(bitc::PARAMATTR_CODE_ENTRY, Record); Record.clear(); @@ -3413,30 +3415,8 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() { // Create value IDs for undefined references. forEachSummary([&](GVInfo I) { - if (auto *VS = dyn_cast(I.second)) { - for (auto &RI : VS->refs()) - assignValueId(RI.getGUID()); - return; - } - - auto *FS = dyn_cast(I.second); - if (!FS) - return; - for (auto &RI : FS->refs()) + for (auto &RI : I.second->refs()) assignValueId(RI.getGUID()); - - for (auto &EI : FS->calls()) { - GlobalValue::GUID GUID = EI.first.getGUID(); - if (!hasValueId(GUID)) { - // For SamplePGO, the indirect call targets for local functions will - // have its original name annotated in profile. 
We try to find the - // corresponding PGOFuncName as the GUID. - GUID = Index.getGUIDFromOriginalID(GUID); - if (GUID == 0 || !hasValueId(GUID)) - continue; - } - assignValueId(GUID); - } }); for (const auto &GVI : valueIds()) { diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index fd76400331d9..bb626baabd12 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -902,8 +902,11 @@ void ValueEnumerator::EnumerateAttributes(AttributeList PAL) { } // Do lookups for all attribute groups. - for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) { - IndexAndAttrSet Pair = {PAL.getSlotIndex(i), PAL.getSlotAttributes(i)}; + for (unsigned i = PAL.index_begin(), e = PAL.index_end(); i != e; ++i) { + AttributeSet AS = PAL.getAttributes(i); + if (!AS.hasAttributes()) + continue; + IndexAndAttrSet Pair = {i, AS}; unsigned &Entry = AttributeGroupMap[Pair]; if (Entry == 0) { AttributeGroups.push_back(Pair); diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 7ddb86d80bf0..d72cf5922987 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -628,12 +628,15 @@ void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value, /// EmitFunctionHeader - This method emits the header for the current /// function. void AsmPrinter::EmitFunctionHeader() { + const Function *F = MF->getFunction(); + + if (isVerbose()) + OutStreamer->GetCommentOS() << "-- Begin function " << F->getName() << '\n'; + // Print out constants referenced by the function EmitConstantPool(); // Print the 'header' of function. - const Function *F = MF->getFunction(); - OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(F, TM)); EmitVisibility(CurrentFnSym, F->getVisibility()); @@ -1107,6 +1110,9 @@ void AsmPrinter::EmitFunctionBody() { HI.Handler->endFunction(MF); } + if (isVerbose()) + OutStreamer->GetCommentOS() << "-- End function\n"; + OutStreamer->AddBlankLine(); } diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 1b39e46ee466..114aea391a86 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -1025,11 +1025,11 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) { bool EmptyPrologue = true; for (const auto &MBB : *MF) { for (const auto &MI : MBB) { - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) { PrologEndLoc = MI.getDebugLoc(); break; - } else if (!MI.isDebugValue()) { + } else if (!MI.isMetaInstruction()) { EmptyPrologue = false; } } @@ -1562,7 +1562,7 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) { EnumeratorCount++; } } - FTI = FLRB.end(); + FTI = FLRB.end(true); } std::string FullName = getFullyQualifiedName(Ty); @@ -1869,7 +1869,7 @@ CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) { MemberCount++; } - TypeIndex FieldTI = FLBR.end(); + TypeIndex FieldTI = FLBR.end(true); return std::make_tuple(FieldTI, Info.VShapeTI, MemberCount, !Info.NestedClasses.empty()); } diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp index 8e3b88d0af0e..201030f0ac5c 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.cpp +++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp @@ -116,65 +116,17 @@ void DIEHash::addParentContext(const DIE &Parent) { // Collect all of the attributes for a particular 
DIE in single structure. void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) { -#define COLLECT_ATTR(NAME) \ - case dwarf::NAME: \ - Attrs.NAME = V; \ - break for (const auto &V : Die.values()) { DEBUG(dbgs() << "Attribute: " << dwarf::AttributeString(V.getAttribute()) << " added.\n"); switch (V.getAttribute()) { - COLLECT_ATTR(DW_AT_name); - COLLECT_ATTR(DW_AT_accessibility); - COLLECT_ATTR(DW_AT_address_class); - COLLECT_ATTR(DW_AT_allocated); - COLLECT_ATTR(DW_AT_artificial); - COLLECT_ATTR(DW_AT_associated); - COLLECT_ATTR(DW_AT_binary_scale); - COLLECT_ATTR(DW_AT_bit_offset); - COLLECT_ATTR(DW_AT_bit_size); - COLLECT_ATTR(DW_AT_bit_stride); - COLLECT_ATTR(DW_AT_byte_size); - COLLECT_ATTR(DW_AT_byte_stride); - COLLECT_ATTR(DW_AT_const_expr); - COLLECT_ATTR(DW_AT_const_value); - COLLECT_ATTR(DW_AT_containing_type); - COLLECT_ATTR(DW_AT_count); - COLLECT_ATTR(DW_AT_data_bit_offset); - COLLECT_ATTR(DW_AT_data_location); - COLLECT_ATTR(DW_AT_data_member_location); - COLLECT_ATTR(DW_AT_decimal_scale); - COLLECT_ATTR(DW_AT_decimal_sign); - COLLECT_ATTR(DW_AT_default_value); - COLLECT_ATTR(DW_AT_digit_count); - COLLECT_ATTR(DW_AT_discr); - COLLECT_ATTR(DW_AT_discr_list); - COLLECT_ATTR(DW_AT_discr_value); - COLLECT_ATTR(DW_AT_encoding); - COLLECT_ATTR(DW_AT_enum_class); - COLLECT_ATTR(DW_AT_endianity); - COLLECT_ATTR(DW_AT_explicit); - COLLECT_ATTR(DW_AT_is_optional); - COLLECT_ATTR(DW_AT_location); - COLLECT_ATTR(DW_AT_lower_bound); - COLLECT_ATTR(DW_AT_mutable); - COLLECT_ATTR(DW_AT_ordering); - COLLECT_ATTR(DW_AT_picture_string); - COLLECT_ATTR(DW_AT_prototyped); - COLLECT_ATTR(DW_AT_small); - COLLECT_ATTR(DW_AT_segment); - COLLECT_ATTR(DW_AT_string_length); - COLLECT_ATTR(DW_AT_threads_scaled); - COLLECT_ATTR(DW_AT_upper_bound); - COLLECT_ATTR(DW_AT_use_location); - COLLECT_ATTR(DW_AT_use_UTF8); - COLLECT_ATTR(DW_AT_variable_parameter); - COLLECT_ATTR(DW_AT_virtuality); - COLLECT_ATTR(DW_AT_visibility); - COLLECT_ATTR(DW_AT_vtable_elem_location); - COLLECT_ATTR(DW_AT_type); +#define HANDLE_DIE_HASH_ATTR(NAME) \ + case dwarf::NAME: \ + Attrs.NAME = V; \ + break; +#include "DIEHashAttributes.def" default: break; } @@ -366,62 +318,12 @@ void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) { // Go through the attributes from \param Attrs in the order specified in 7.27.4 // and hash them. 
void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) { -#define ADD_ATTR(ATTR) \ +#define HANDLE_DIE_HASH_ATTR(NAME) \ { \ - if (ATTR) \ - hashAttribute(ATTR, Tag); \ + if (Attrs.NAME) \ + hashAttribute(Attrs.NAME, Tag); \ } - - ADD_ATTR(Attrs.DW_AT_name); - ADD_ATTR(Attrs.DW_AT_accessibility); - ADD_ATTR(Attrs.DW_AT_address_class); - ADD_ATTR(Attrs.DW_AT_allocated); - ADD_ATTR(Attrs.DW_AT_artificial); - ADD_ATTR(Attrs.DW_AT_associated); - ADD_ATTR(Attrs.DW_AT_binary_scale); - ADD_ATTR(Attrs.DW_AT_bit_offset); - ADD_ATTR(Attrs.DW_AT_bit_size); - ADD_ATTR(Attrs.DW_AT_bit_stride); - ADD_ATTR(Attrs.DW_AT_byte_size); - ADD_ATTR(Attrs.DW_AT_byte_stride); - ADD_ATTR(Attrs.DW_AT_const_expr); - ADD_ATTR(Attrs.DW_AT_const_value); - ADD_ATTR(Attrs.DW_AT_containing_type); - ADD_ATTR(Attrs.DW_AT_count); - ADD_ATTR(Attrs.DW_AT_data_bit_offset); - ADD_ATTR(Attrs.DW_AT_data_location); - ADD_ATTR(Attrs.DW_AT_data_member_location); - ADD_ATTR(Attrs.DW_AT_decimal_scale); - ADD_ATTR(Attrs.DW_AT_decimal_sign); - ADD_ATTR(Attrs.DW_AT_default_value); - ADD_ATTR(Attrs.DW_AT_digit_count); - ADD_ATTR(Attrs.DW_AT_discr); - ADD_ATTR(Attrs.DW_AT_discr_list); - ADD_ATTR(Attrs.DW_AT_discr_value); - ADD_ATTR(Attrs.DW_AT_encoding); - ADD_ATTR(Attrs.DW_AT_enum_class); - ADD_ATTR(Attrs.DW_AT_endianity); - ADD_ATTR(Attrs.DW_AT_explicit); - ADD_ATTR(Attrs.DW_AT_is_optional); - ADD_ATTR(Attrs.DW_AT_location); - ADD_ATTR(Attrs.DW_AT_lower_bound); - ADD_ATTR(Attrs.DW_AT_mutable); - ADD_ATTR(Attrs.DW_AT_ordering); - ADD_ATTR(Attrs.DW_AT_picture_string); - ADD_ATTR(Attrs.DW_AT_prototyped); - ADD_ATTR(Attrs.DW_AT_small); - ADD_ATTR(Attrs.DW_AT_segment); - ADD_ATTR(Attrs.DW_AT_string_length); - ADD_ATTR(Attrs.DW_AT_threads_scaled); - ADD_ATTR(Attrs.DW_AT_upper_bound); - ADD_ATTR(Attrs.DW_AT_use_location); - ADD_ATTR(Attrs.DW_AT_use_UTF8); - ADD_ATTR(Attrs.DW_AT_variable_parameter); - ADD_ATTR(Attrs.DW_AT_virtuality); - ADD_ATTR(Attrs.DW_AT_visibility); - ADD_ATTR(Attrs.DW_AT_vtable_elem_location); - ADD_ATTR(Attrs.DW_AT_type); - +#include "DIEHashAttributes.def" // FIXME: Add the extended attributes. } @@ -478,10 +380,12 @@ void DIEHash::computeHash(const DIE &Die) { /// DWARF4 standard. It is an md5 hash of the flattened description of the DIE /// with the inclusion of the full CU and all top level CU entities. // TODO: Initialize the type chain at 0 instead of 1 for CU signatures. -uint64_t DIEHash::computeCUSignature(const DIE &Die) { +uint64_t DIEHash::computeCUSignature(StringRef DWOName, const DIE &Die) { Numbering.clear(); Numbering[&Die] = 1; + if (!DWOName.empty()) + Hash.update(DWOName); // Hash the DIE. computeHash(Die); diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h index 996cd7ef3d2e..29337ae38a99 100644 --- a/lib/CodeGen/AsmPrinter/DIEHash.h +++ b/lib/CodeGen/AsmPrinter/DIEHash.h @@ -28,64 +28,15 @@ class CompileUnit; class DIEHash { // Collection of all attributes used in hashing a particular DIE. 
struct DIEAttrs { - DIEValue DW_AT_name; - DIEValue DW_AT_accessibility; - DIEValue DW_AT_address_class; - DIEValue DW_AT_allocated; - DIEValue DW_AT_artificial; - DIEValue DW_AT_associated; - DIEValue DW_AT_binary_scale; - DIEValue DW_AT_bit_offset; - DIEValue DW_AT_bit_size; - DIEValue DW_AT_bit_stride; - DIEValue DW_AT_byte_size; - DIEValue DW_AT_byte_stride; - DIEValue DW_AT_const_expr; - DIEValue DW_AT_const_value; - DIEValue DW_AT_containing_type; - DIEValue DW_AT_count; - DIEValue DW_AT_data_bit_offset; - DIEValue DW_AT_data_location; - DIEValue DW_AT_data_member_location; - DIEValue DW_AT_decimal_scale; - DIEValue DW_AT_decimal_sign; - DIEValue DW_AT_default_value; - DIEValue DW_AT_digit_count; - DIEValue DW_AT_discr; - DIEValue DW_AT_discr_list; - DIEValue DW_AT_discr_value; - DIEValue DW_AT_encoding; - DIEValue DW_AT_enum_class; - DIEValue DW_AT_endianity; - DIEValue DW_AT_explicit; - DIEValue DW_AT_is_optional; - DIEValue DW_AT_location; - DIEValue DW_AT_lower_bound; - DIEValue DW_AT_mutable; - DIEValue DW_AT_ordering; - DIEValue DW_AT_picture_string; - DIEValue DW_AT_prototyped; - DIEValue DW_AT_small; - DIEValue DW_AT_segment; - DIEValue DW_AT_string_length; - DIEValue DW_AT_threads_scaled; - DIEValue DW_AT_upper_bound; - DIEValue DW_AT_use_location; - DIEValue DW_AT_use_UTF8; - DIEValue DW_AT_variable_parameter; - DIEValue DW_AT_virtuality; - DIEValue DW_AT_visibility; - DIEValue DW_AT_vtable_elem_location; - DIEValue DW_AT_type; - - // Insert any additional ones here... +#define HANDLE_DIE_HASH_ATTR(NAME) DIEValue NAME; +#include "DIEHashAttributes.def" }; public: DIEHash(AsmPrinter *A = nullptr) : AP(A) {} /// \brief Computes the CU signature. - uint64_t computeCUSignature(const DIE &Die); + uint64_t computeCUSignature(StringRef DWOName, const DIE &Die); /// \brief Computes the type signature. 
uint64_t computeTypeSignature(const DIE &Die); diff --git a/lib/CodeGen/AsmPrinter/DIEHashAttributes.def b/lib/CodeGen/AsmPrinter/DIEHashAttributes.def new file mode 100644 index 000000000000..28a02390fccb --- /dev/null +++ b/lib/CodeGen/AsmPrinter/DIEHashAttributes.def @@ -0,0 +1,55 @@ +#ifndef HANDLE_DIE_HASH_ATTR +#error "Missing macro definition of HANDLE_DIE_HASH_ATTR" +#endif + +HANDLE_DIE_HASH_ATTR(DW_AT_name) +HANDLE_DIE_HASH_ATTR(DW_AT_accessibility) +HANDLE_DIE_HASH_ATTR(DW_AT_address_class) +HANDLE_DIE_HASH_ATTR(DW_AT_allocated) +HANDLE_DIE_HASH_ATTR(DW_AT_artificial) +HANDLE_DIE_HASH_ATTR(DW_AT_associated) +HANDLE_DIE_HASH_ATTR(DW_AT_binary_scale) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_offset) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_size) +HANDLE_DIE_HASH_ATTR(DW_AT_bit_stride) +HANDLE_DIE_HASH_ATTR(DW_AT_byte_size) +HANDLE_DIE_HASH_ATTR(DW_AT_byte_stride) +HANDLE_DIE_HASH_ATTR(DW_AT_const_expr) +HANDLE_DIE_HASH_ATTR(DW_AT_const_value) +HANDLE_DIE_HASH_ATTR(DW_AT_containing_type) +HANDLE_DIE_HASH_ATTR(DW_AT_count) +HANDLE_DIE_HASH_ATTR(DW_AT_data_bit_offset) +HANDLE_DIE_HASH_ATTR(DW_AT_data_location) +HANDLE_DIE_HASH_ATTR(DW_AT_data_member_location) +HANDLE_DIE_HASH_ATTR(DW_AT_decimal_scale) +HANDLE_DIE_HASH_ATTR(DW_AT_decimal_sign) +HANDLE_DIE_HASH_ATTR(DW_AT_default_value) +HANDLE_DIE_HASH_ATTR(DW_AT_digit_count) +HANDLE_DIE_HASH_ATTR(DW_AT_discr) +HANDLE_DIE_HASH_ATTR(DW_AT_discr_list) +HANDLE_DIE_HASH_ATTR(DW_AT_discr_value) +HANDLE_DIE_HASH_ATTR(DW_AT_encoding) +HANDLE_DIE_HASH_ATTR(DW_AT_enum_class) +HANDLE_DIE_HASH_ATTR(DW_AT_endianity) +HANDLE_DIE_HASH_ATTR(DW_AT_explicit) +HANDLE_DIE_HASH_ATTR(DW_AT_is_optional) +HANDLE_DIE_HASH_ATTR(DW_AT_location) +HANDLE_DIE_HASH_ATTR(DW_AT_lower_bound) +HANDLE_DIE_HASH_ATTR(DW_AT_mutable) +HANDLE_DIE_HASH_ATTR(DW_AT_ordering) +HANDLE_DIE_HASH_ATTR(DW_AT_picture_string) +HANDLE_DIE_HASH_ATTR(DW_AT_prototyped) +HANDLE_DIE_HASH_ATTR(DW_AT_small) +HANDLE_DIE_HASH_ATTR(DW_AT_segment) +HANDLE_DIE_HASH_ATTR(DW_AT_string_length) +HANDLE_DIE_HASH_ATTR(DW_AT_threads_scaled) +HANDLE_DIE_HASH_ATTR(DW_AT_upper_bound) +HANDLE_DIE_HASH_ATTR(DW_AT_use_location) +HANDLE_DIE_HASH_ATTR(DW_AT_use_UTF8) +HANDLE_DIE_HASH_ATTR(DW_AT_variable_parameter) +HANDLE_DIE_HASH_ATTR(DW_AT_virtuality) +HANDLE_DIE_HASH_ATTR(DW_AT_visibility) +HANDLE_DIE_HASH_ATTR(DW_AT_vtable_elem_location) +HANDLE_DIE_HASH_ATTR(DW_AT_type) + +#undef HANDLE_DIE_HASH_ATTR diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp index 826162ad47c4..0971c5942203 100644 --- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp +++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp @@ -115,7 +115,8 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) { return getBaseTypeSize(BaseType); } -bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) { +static bool hasDebugInfo(const MachineModuleInfo *MMI, + const MachineFunction *MF) { if (!MMI->hasDebugInfo()) return false; auto *SP = MF->getFunction()->getSubprogram(); @@ -223,9 +224,9 @@ void DebugHandlerBase::endInstruction() { return; assert(CurMI != nullptr); - // Don't create a new label after DBG_VALUE instructions. - // They don't generate code. - if (!CurMI->isDebugValue()) { + // Don't create a new label after DBG_VALUE and other instructions that don't + // generate code. 
+ if (!CurMI->isMetaInstruction()) { PrevLabel = nullptr; PrevInstBB = CurMI->getParent(); } diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index e172712cf889..04073b3aed68 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -760,7 +760,7 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) { /// addGlobalName - Add a new global name to the compile unit. void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Name.str(); GlobalNames[FullName] = &Die; @@ -768,7 +768,7 @@ void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die, void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Name.str(); // Insert, allowing the entry to remain as-is if it's already present @@ -781,7 +781,7 @@ void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name, /// Add a new global type to the unit. void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Ty->getName().str(); GlobalTypes[FullName] = &Die; @@ -789,7 +789,7 @@ void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die, void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context) { - if (includeMinimalInlineScopes()) + if (!DD->hasDwarfPubSections(includeMinimalInlineScopes())) return; std::string FullName = getParentContextString(Context) + Ty->getName().str(); // Insert, allowing the entry to remain as-is if it's already present diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 77e9e671529f..b8f57472f17c 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -77,8 +77,6 @@ class DwarfCompileUnit final : public DwarfUnit { bool isDwoUnit() const override; - bool includeMinimalInlineScopes() const; - DenseMap &getAbstractSPDies() { if (isDwoUnit() && !DD->shareAcrossDWOCUs()) return AbstractSPDies; @@ -101,6 +99,8 @@ public: return Skeleton; } + bool includeMinimalInlineScopes() const; + void initStmtList(); /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE. diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 3410b98d7776..bf27516e1ccd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -252,12 +252,6 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M) // Handle split DWARF. HasSplitDwarf = !Asm->TM.Options.MCOptions.SplitDwarfFile.empty(); - // Pubnames/pubtypes on by default for GDB. - if (DwarfPubSections == Default) - HasDwarfPubSections = tuneForGDB(); - else - HasDwarfPubSections = DwarfPubSections == Enable; - // SCE defaults to linkage names only for abstract subprograms. 
if (DwarfLinkageNames == DefaultLinkageNames) UseAllLinkageNames = !tuneForSCE(); @@ -380,19 +374,35 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. - auto &CU = *CUMap.lookup(SP->getUnit()); - if (auto *SkelCU = CU.getSkeleton()) { - (shareAcrossDWOCUs() ? CU : SrcCU) - .constructAbstractSubprogramScopeDIE(Scope); - if (CU.getCUNode()->getSplitDebugInlining()) - SkelCU->constructAbstractSubprogramScopeDIE(Scope); - } else { - CU.constructAbstractSubprogramScopeDIE(Scope); + if (useSplitDwarf() && !shareAcrossDWOCUs() && !SP->getUnit()->getSplitDebugInlining()) + // Avoid building the original CU if it won't be used + SrcCU.constructAbstractSubprogramScopeDIE(Scope); + else { + auto &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + if (auto *SkelCU = CU.getSkeleton()) { + (shareAcrossDWOCUs() ? CU : SrcCU) + .constructAbstractSubprogramScopeDIE(Scope); + if (CU.getCUNode()->getSplitDebugInlining()) + SkelCU->constructAbstractSubprogramScopeDIE(Scope); + } else + CU.constructAbstractSubprogramScopeDIE(Scope); } } -void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { - if (!GenerateGnuPubSections) +bool DwarfDebug::hasDwarfPubSections(bool includeMinimalInlineScopes) const { + // Opting in to GNU Pubnames/types overrides the default to ensure these are + // generated for things like Gold's gdb_index generation. + if (GenerateGnuPubSections) + return true; + + if (DwarfPubSections == Default) + return tuneForGDB() && !includeMinimalInlineScopes; + + return DwarfPubSections == Enable; +} + +void DwarfDebug::addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const { + if (!hasDwarfPubSections(U.includeMinimalInlineScopes())) return; U.addFlag(D, dwarf::DW_AT_GNU_pubnames); @@ -401,7 +411,9 @@ void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { // Create new DwarfCompileUnit for the given metadata node with tag // DW_TAG_compile_unit. 
DwarfCompileUnit & -DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) { +DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) { + if (auto *CU = CUMap.lookup(DIUnit)) + return *CU; StringRef FN = DIUnit->getFilename(); CompilationDir = DIUnit->getDirectory(); @@ -534,7 +546,12 @@ void DwarfDebug::beginModule() { } for (DICompileUnit *CUNode : M->debug_compile_units()) { - DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode); + if (CUNode->getEnumTypes().empty() && CUNode->getRetainedTypes().empty() && + CUNode->getGlobalVariables().empty() && + CUNode->getImportedEntities().empty() && CUNode->getMacros().empty()) + continue; + + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(CUNode); for (auto *IE : CUNode->getImportedEntities()) CU.addImportedEntity(IE); @@ -581,11 +598,12 @@ void DwarfDebug::finishVariableDefinitions() { } void DwarfDebug::finishSubprogramDefinitions() { - for (const DISubprogram *SP : ProcessedSPNodes) - if (SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug) - forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) { - CU.finishSubprogramDefinition(SP); - }); + for (const DISubprogram *SP : ProcessedSPNodes) { + assert(SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug); + forBothCUs( + getOrCreateDwarfCompileUnit(SP->getUnit()), + [&](DwarfCompileUnit &CU) { CU.finishSubprogramDefinition(SP); }); + } } void DwarfDebug::finalizeModuleInfo() { @@ -595,6 +613,13 @@ void DwarfDebug::finalizeModuleInfo() { finishVariableDefinitions(); + // Include the DWO file name in the hash if there's more than one CU. + // This handles ThinLTO's situation where imported CUs may very easily be + // duplicate with the same CU partially imported into another ThinLTO unit. + StringRef DWOName; + if (CUMap.size() > 1) + DWOName = Asm->TM.Options.MCOptions.SplitDwarfFile; + // Handle anything that needs to be done on a per-unit basis after // all other generation. for (const auto &P : CUMap) { @@ -609,7 +634,8 @@ void DwarfDebug::finalizeModuleInfo() { auto *SkCU = TheCU.getSkeleton(); if (useSplitDwarf()) { // Emit a unique identifier for this CU. - uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie()); + uint64_t ID = + DIEHash(Asm).computeCUSignature(DWOName, TheCU.getUnitDie()); TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8, ID); SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id, @@ -718,7 +744,9 @@ void DwarfDebug::endModule() { } // Emit the pubnames and pubtypes sections if requested. - if (HasDwarfPubSections) { + // The condition is optimistically correct - any CU not using GMLT (& + // implicit/default pubnames state) might still have pubnames. + if (hasDwarfPubSections(/* gmlt */ false)) { emitDebugPubNames(GenerateGnuPubSections); emitDebugPubTypes(GenerateGnuPubSections); } @@ -1028,8 +1056,12 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) { DebugHandlerBase::beginInstruction(MI); assert(CurMI); + const auto *SP = MI->getParent()->getParent()->getFunction()->getSubprogram(); + if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) + return; + // Check if source location changes, but ignore DBG_VALUE and CFI locations. 
- if (MI->isDebugValue() || MI->isCFIInstruction()) + if (MI->isMetaInstruction()) return; const DebugLoc &DL = MI->getDebugLoc(); // When we emit a line-0 record, we don't update PrevInstLoc; so look at @@ -1111,7 +1143,7 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { // the beginning of the function body. for (const auto &MBB : *MF) for (const auto &MI : MBB) - if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) && + if (!MI.isMetaInstruction() && !MI.getFlag(MachineInstr::FrameSetup) && MI.getDebugLoc()) return MI.getDebugLoc(); return DebugLoc(); @@ -1122,40 +1154,28 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) { void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) { CurFn = MF; - if (LScopes.empty()) + auto *SP = MF->getFunction()->getSubprogram(); + assert(LScopes.empty() || SP == LScopes.getCurrentFunctionScope()->getScopeNode()); + if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) return; + DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit()); + // Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function // belongs to so that we add to the correct per-cu line table in the // non-asm case. - LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); - // FnScope->getScopeNode() and DI->second should represent the same function, - // though they may not be the same MDNode due to inline functions merged in - // LTO where the debug info metadata still differs (either due to distinct - // written differences - two versions of a linkonce_odr function - // written/copied into two separate files, or some sub-optimal metadata that - // isn't structurally identical (see: file path/name info from clang, which - // includes the directory of the cpp file being built, even when the file name - // is absolute (such as an <> lookup header))) - auto *SP = cast(FnScope->getScopeNode()); - DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit()); - if (!TheCU) { - assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug && - "DICompileUnit missing from llvm.dbg.cu?"); - return; - } if (Asm->OutStreamer->hasRawTextSupport()) // Use a single line table if we are generating assembly. Asm->OutStreamer->getContext().setDwarfCompileUnitID(0); else - Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + Asm->OutStreamer->getContext().setDwarfCompileUnitID(CU.getUniqueID()); // Record beginning of function. PrologEndLoc = findPrologueEndLoc(MF); - if (DILocation *L = PrologEndLoc) { + if (PrologEndLoc) { // We'd like to list the prologue as "not statements" but GDB behaves // poorly if we do that. Revisit this with caution/GDB (7.5+) testing. - auto *SP = L->getInlinedAtScope()->getSubprogram(); + auto *SP = PrologEndLoc->getInlinedAtScope()->getSubprogram(); recordSourceLine(SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT); } } @@ -1395,7 +1415,7 @@ void DwarfDebug::emitDebugPubSection( const auto &Globals = (TheU->*Accessor)(); - if (Globals.empty()) + if (!hasDwarfPubSections(TheU->includeMinimalInlineScopes())) continue; if (auto *Skeleton = TheU->getSkeleton()) @@ -1544,6 +1564,9 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) { // Emit locations into the debug loc section. void DwarfDebug::emitDebugLoc() { + if (DebugLocs.getLists().empty()) + return; + // Start the dwarf loc section. 
Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfLocSection()); @@ -1755,6 +1778,9 @@ void DwarfDebug::emitDebugARanges() { /// Emit address ranges into a debug ranges section. void DwarfDebug::emitDebugRanges() { + if (CUMap.empty()) + return; + // Start the dwarf ranges section. Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfRangesSection()); @@ -1834,6 +1860,9 @@ void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) { /// Emit macros into a debug macinfo section. void DwarfDebug::emitDebugMacinfo() { + if (CUMap.empty()) + return; + // Start the dwarf macinfo section. Asm->OutStreamer->SwitchSection( Asm->getObjFileLowering().getDwarfMacinfoSection()); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index b9c5aa9ffb23..ebfba4cfc275 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -246,9 +246,6 @@ class DwarfDebug : public DebugHandlerBase { std::pair, const DICompositeType *>, 1> TypeUnitsUnderConstruction; - /// Whether to emit the pubnames/pubtypes sections. - bool HasDwarfPubSections; - /// Whether to use the GNU TLS opcode (instead of the standard opcode). bool UseGNUTLSOpcode; @@ -415,11 +412,11 @@ class DwarfDebug : public DebugHandlerBase { /// Flags to let the linker know we have emitted new style pubnames. Only /// emit it here if we don't have a skeleton CU for split dwarf. - void addGnuPubAttributes(DwarfUnit &U, DIE &D) const; + void addGnuPubAttributes(DwarfCompileUnit &U, DIE &D) const; /// Create new DwarfCompileUnit for the given metadata node with tag /// DW_TAG_compile_unit. - DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit); + DwarfCompileUnit &getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit); /// Construct imported_module or imported_declaration DIE. void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU, @@ -556,6 +553,8 @@ public: /// A helper function to check whether the DIE for a given Scope is /// going to be null. bool isLexicalScopeDIENull(LexicalScope *Scope); + + bool hasDwarfPubSections(bool includeMinimalInlineScopes) const; }; } // End of namespace llvm diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index 984973cf3a3b..344136b1f195 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -96,7 +96,7 @@ namespace { char AtomicExpand::ID = 0; char &llvm::AtomicExpandID = AtomicExpand::ID; -INITIALIZE_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions", +INITIALIZE_PASS(AtomicExpand, DEBUG_TYPE, "Expand Atomic instructions", false, false) FunctionPass *llvm::createAtomicExpandPass() { return new AtomicExpand(); } diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index a67e194356d8..d3fced436b68 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -24,8 +24,6 @@ #include using namespace llvm; -#define DEBUG_TYPE "basictti" - // This flag is used by the template base class for BasicTTIImpl, and here to // provide a definition. 
cl::opt diff --git a/lib/CodeGen/BranchCoalescing.cpp b/lib/CodeGen/BranchCoalescing.cpp index efdf300df850..2c41b597843c 100644 --- a/lib/CodeGen/BranchCoalescing.cpp +++ b/lib/CodeGen/BranchCoalescing.cpp @@ -27,7 +27,7 @@ using namespace llvm; -#define DEBUG_TYPE "coal-branch" +#define DEBUG_TYPE "branch-coalescing" static cl::opt EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden, @@ -193,11 +193,11 @@ public: char BranchCoalescing::ID = 0; char &llvm::BranchCoalescingID = BranchCoalescing::ID; -INITIALIZE_PASS_BEGIN(BranchCoalescing, "branch-coalescing", +INITIALIZE_PASS_BEGIN(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_END(BranchCoalescing, "branch-coalescing", "Branch Coalescing", +INITIALIZE_PASS_END(BranchCoalescing, DEBUG_TYPE, "Branch Coalescing", false, false) BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo() diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index b63d9f4a4351..03ceac10beec 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -44,7 +44,7 @@ #include using namespace llvm; -#define DEBUG_TYPE "branchfolding" +#define DEBUG_TYPE "branch-folder" STATISTIC(NumDeadBlocks, "Number of dead blocks removed"); STATISTIC(NumBranchOpts, "Number of branches optimized"); @@ -89,7 +89,7 @@ namespace { char BranchFolderPass::ID = 0; char &llvm::BranchFolderPassID = BranchFolderPass::ID; -INITIALIZE_PASS(BranchFolderPass, "branch-folder", +INITIALIZE_PASS(BranchFolderPass, DEBUG_TYPE, "Control Flow Optimizer", false, false) bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) { @@ -153,13 +153,14 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, TriedMerging.clear(); + MachineRegisterInfo &MRI = MF.getRegInfo(); AfterBlockPlacement = AfterPlacement; TII = tii; TRI = tri; MMI = mmi; MLI = mli; + this->MRI = &MRI; - MachineRegisterInfo &MRI = MF.getRegInfo(); UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF); if (!UpdateLiveIns) MRI.invalidateLiveness(); @@ -351,7 +352,7 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst, if (UpdateLiveIns) { NewDest->clearLiveIns(); - computeLiveIns(LiveRegs, *TRI, *NewDest); + computeLiveIns(LiveRegs, *MRI, *NewDest); } ++NumTailMerge; @@ -388,7 +389,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB, MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB)); if (UpdateLiveIns) - computeLiveIns(LiveRegs, *TRI, *NewMBB); + computeLiveIns(LiveRegs, *MRI, *NewMBB); // Add the new block to the funclet. const auto &FuncletI = FuncletMembership.find(&CurMBB); diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h index 4852721eea10..92681137e4c6 100644 --- a/lib/CodeGen/BranchFolding.h +++ b/lib/CodeGen/BranchFolding.h @@ -108,6 +108,7 @@ namespace llvm { bool UpdateLiveIns; unsigned MinCommonTailLength; const TargetInstrInfo *TII; + const MachineRegisterInfo *MRI; const TargetRegisterInfo *TRI; MachineModuleInfo *MMI; MachineLoopInfo *MLI; diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp index 7af136941661..e3de61c7816f 100644 --- a/lib/CodeGen/BranchRelaxation.cpp +++ b/lib/CodeGen/BranchRelaxation.cpp @@ -259,7 +259,7 @@ MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI, // Need to fix live-in lists if we track liveness. 
if (TRI->trackLivenessAfterRegAlloc(*MF)) - computeLiveIns(LiveRegs, *TRI, *NewBB); + computeLiveIns(LiveRegs, MF->getRegInfo(), *NewBB); ++NumSplit; @@ -345,6 +345,10 @@ bool BranchRelaxation::fixupConditionalBranch(MachineInstr &MI) { // Do it here since if there's no split, no update is needed. MBB->replaceSuccessor(FBB, &NewBB); NewBB.addSuccessor(FBB); + + // Need to fix live-in lists if we track liveness. + if (TRI->trackLivenessAfterRegAlloc(*MF)) + computeLiveIns(LiveRegs, MF->getRegInfo(), NewBB); } // We now have an appropriate fall-through block in place (either naturally or diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index 3a1a3020a8d4..4e85708efafc 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -257,10 +257,10 @@ class TypePromotionTransaction; } char CodeGenPrepare::ID = 0; -INITIALIZE_PASS_BEGIN(CodeGenPrepare, "codegenprepare", +INITIALIZE_PASS_BEGIN(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation", false, false) INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) -INITIALIZE_PASS_END(CodeGenPrepare, "codegenprepare", +INITIALIZE_PASS_END(CodeGenPrepare, DEBUG_TYPE, "Optimize for code generation", false, false) FunctionPass *llvm::createCodeGenPreparePass() { return new CodeGenPrepare(); } diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp index 7ac2e5445435..265dda16bfa7 100644 --- a/lib/CodeGen/DeadMachineInstructionElim.cpp +++ b/lib/CodeGen/DeadMachineInstructionElim.cpp @@ -23,7 +23,7 @@ using namespace llvm; -#define DEBUG_TYPE "codegen-dce" +#define DEBUG_TYPE "dead-mi-elimination" STATISTIC(NumDeletes, "Number of dead instructions deleted"); @@ -54,7 +54,7 @@ namespace { char DeadMachineInstructionElim::ID = 0; char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID; -INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination", +INITIALIZE_PASS(DeadMachineInstructionElim, DEBUG_TYPE, "Remove dead machine instructions", false, false) bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const { diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp index 6f4ea1912cf4..ab9a0592e017 100644 --- a/lib/CodeGen/DetectDeadLanes.cpp +++ b/lib/CodeGen/DetectDeadLanes.cpp @@ -132,8 +132,7 @@ private: char DetectDeadLanes::ID = 0; char &llvm::DetectDeadLanesID = DetectDeadLanes::ID; -INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes", - false, false) +INITIALIZE_PASS(DetectDeadLanes, DEBUG_TYPE, "Detect Dead Lanes", false, false) /// Returns true if \p MI will get lowered to a series of COPY instructions. /// We call this a COPY-like instruction. 
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp index 1ef4d8660657..06ae5cd72c85 100644 --- a/lib/CodeGen/DwarfEHPrepare.cpp +++ b/lib/CodeGen/DwarfEHPrepare.cpp @@ -71,12 +71,12 @@ namespace { } // end anonymous namespace char DwarfEHPrepare::ID = 0; -INITIALIZE_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare", +INITIALIZE_PASS_BEGIN(DwarfEHPrepare, DEBUG_TYPE, "Prepare DWARF exceptions", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) -INITIALIZE_PASS_END(DwarfEHPrepare, "dwarfehprepare", +INITIALIZE_PASS_END(DwarfEHPrepare, DEBUG_TYPE, "Prepare DWARF exceptions", false, false) FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); } diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index 729172796453..402afe75b141 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -616,13 +616,13 @@ private: char EarlyIfConverter::ID = 0; char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; -INITIALIZE_PASS_BEGIN(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(EarlyIfConverter, - "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, + "Early If Converter", false, false) void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); diff --git a/lib/CodeGen/ExpandISelPseudos.cpp b/lib/CodeGen/ExpandISelPseudos.cpp index 0ec79c2e69f9..88d422a0f545 100644 --- a/lib/CodeGen/ExpandISelPseudos.cpp +++ b/lib/CodeGen/ExpandISelPseudos.cpp @@ -41,7 +41,7 @@ namespace { char ExpandISelPseudos::ID = 0; char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID; -INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos", +INITIALIZE_PASS(ExpandISelPseudos, DEBUG_TYPE, "Expand ISel Pseudo-instructions", false, false) bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) { diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index e860906043dd..27cd639b2a49 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -58,7 +58,7 @@ private: char ExpandPostRA::ID = 0; char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID; -INITIALIZE_PASS(ExpandPostRA, "postrapseudos", +INITIALIZE_PASS(ExpandPostRA, DEBUG_TYPE, "Post-RA pseudo instruction expansion pass", false, false) /// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered diff --git a/lib/CodeGen/FuncletLayout.cpp b/lib/CodeGen/FuncletLayout.cpp index d61afad4db57..0bdd5e64a7f2 100644 --- a/lib/CodeGen/FuncletLayout.cpp +++ b/lib/CodeGen/FuncletLayout.cpp @@ -37,7 +37,7 @@ public: char FuncletLayout::ID = 0; char &llvm::FuncletLayoutID = FuncletLayout::ID; -INITIALIZE_PASS(FuncletLayout, "funclet-layout", +INITIALIZE_PASS(FuncletLayout, DEBUG_TYPE, "Contiguously Lay Out Funclets", false, false) bool FuncletLayout::runOnMachineFunction(MachineFunction &F) { diff --git a/lib/CodeGen/GlobalISel/CMakeLists.txt b/lib/CodeGen/GlobalISel/CMakeLists.txt index 03a8c4f5f909..eba7ea8132e3 100644 --- a/lib/CodeGen/GlobalISel/CMakeLists.txt +++ b/lib/CodeGen/GlobalISel/CMakeLists.txt @@ -8,6 +8,7 @@ 
set(GLOBAL_ISEL_FILES LegalizerHelper.cpp Legalizer.cpp LegalizerInfo.cpp + Localizer.cpp RegBankSelect.cpp RegisterBank.cpp RegisterBankInfo.cpp @@ -24,11 +25,11 @@ endif() # In LLVMBuild.txt files, it is not possible to mark a dependency to a # library as optional. So instead, generate an empty library if we did -# not ask for it. +# not ask for it. add_llvm_library(LLVMGlobalISel ${GLOBAL_ISEL_BUILD_FILES} GlobalISel.cpp - + DEPENDS intrinsics_gen ) diff --git a/lib/CodeGen/GlobalISel/GlobalISel.cpp b/lib/CodeGen/GlobalISel/GlobalISel.cpp index fcd2722f1c2f..29d1209bb02a 100644 --- a/lib/CodeGen/GlobalISel/GlobalISel.cpp +++ b/lib/CodeGen/GlobalISel/GlobalISel.cpp @@ -26,6 +26,7 @@ void llvm::initializeGlobalISel(PassRegistry &Registry) { void llvm::initializeGlobalISel(PassRegistry &Registry) { initializeIRTranslatorPass(Registry); initializeLegalizerPass(Registry); + initializeLocalizerPass(Registry); initializeRegBankSelectPass(Registry); initializeInstructionSelectPass(Registry); } diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp new file mode 100644 index 000000000000..bdca732b4e33 --- /dev/null +++ b/lib/CodeGen/GlobalISel/Localizer.cpp @@ -0,0 +1,125 @@ +//===- Localizer.cpp ---------------------- Localize some instrs -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements the Localizer class. +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/GlobalISel/Localizer.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "localizer" + +using namespace llvm; + +char Localizer::ID = 0; +INITIALIZE_PASS(Localizer, DEBUG_TYPE, + "Move/duplicate certain instructions close to their use", false, + false); + +Localizer::Localizer() : MachineFunctionPass(ID) { + initializeLocalizerPass(*PassRegistry::getPassRegistry()); +} + +void Localizer::init(MachineFunction &MF) { MRI = &MF.getRegInfo(); } + +bool Localizer::shouldLocalize(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: + return false; + // Constants-like instructions should be close to their users. + // We don't want long live-ranges for them. + case TargetOpcode::G_CONSTANT: + case TargetOpcode::G_FCONSTANT: + case TargetOpcode::G_FRAME_INDEX: + return true; + } +} + +bool Localizer::isLocalUse(MachineOperand &MOUse, const MachineInstr &Def, + MachineBasicBlock *&InsertMBB) { + MachineInstr &MIUse = *MOUse.getParent(); + InsertMBB = MIUse.getParent(); + if (MIUse.isPHI()) + InsertMBB = MIUse.getOperand(MIUse.getOperandNo(&MOUse) + 1).getMBB(); + return InsertMBB == Def.getParent(); +} + +bool Localizer::runOnMachineFunction(MachineFunction &MF) { + // If the ISel pipeline failed, do not bother running that pass. + if (MF.getProperties().hasProperty( + MachineFunctionProperties::Property::FailedISel)) + return false; + + DEBUG(dbgs() << "Localize instructions for: " << MF.getName() << '\n'); + + init(MF); + + bool Changed = false; + // Keep track of the instructions we localized. + // We won't need to process them if we see them later in the CFG. + SmallPtrSet LocalizedInstrs; + DenseMap, unsigned> MBBWithLocalDef; + // TODO: Do bottom up traversal. 
+ for (MachineBasicBlock &MBB : MF) { + for (MachineInstr &MI : MBB) { + if (LocalizedInstrs.count(&MI) || !shouldLocalize(MI)) + continue; + DEBUG(dbgs() << "Should localize: " << MI); + assert(MI.getDesc().getNumDefs() == 1 && + "More than one definition not supported yet"); + unsigned Reg = MI.getOperand(0).getReg(); + // Check if all the users of MI are local. + // We are going to invalidation the list of use operands, so we + // can't use range iterator. + for (auto MOIt = MRI->use_begin(Reg), MOItEnd = MRI->use_end(); + MOIt != MOItEnd;) { + MachineOperand &MOUse = *MOIt++; + // Check if the use is already local. + MachineBasicBlock *InsertMBB; + DEBUG(MachineInstr &MIUse = *MOUse.getParent(); + dbgs() << "Checking use: " << MIUse + << " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n'); + if (isLocalUse(MOUse, MI, InsertMBB)) + continue; + DEBUG(dbgs() << "Fixing non-local use\n"); + Changed = true; + auto MBBAndReg = std::make_pair(InsertMBB, Reg); + auto NewVRegIt = MBBWithLocalDef.find(MBBAndReg); + if (NewVRegIt == MBBWithLocalDef.end()) { + // Create the localized instruction. + MachineInstr *LocalizedMI = MF.CloneMachineInstr(&MI); + LocalizedInstrs.insert(LocalizedMI); + // Move it at the right place. + MachineInstr &MIUse = *MOUse.getParent(); + if (MIUse.getParent() == InsertMBB) + InsertMBB->insert(MIUse, LocalizedMI); + else + InsertMBB->insert(InsertMBB->getFirstNonPHI(), LocalizedMI); + + // Set a new register for the definition. + unsigned NewReg = + MRI->createGenericVirtualRegister(MRI->getType(Reg)); + MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg)); + LocalizedMI->getOperand(0).setReg(NewReg); + NewVRegIt = + MBBWithLocalDef.insert(std::make_pair(MBBAndReg, NewReg)).first; + DEBUG(dbgs() << "Inserted: " << *LocalizedMI); + } + DEBUG(dbgs() << "Update use with: " << PrintReg(NewVRegIt->second) + << '\n'); + // Update the user reg. + MOUse.setReg(NewVRegIt->second); + } + } + } + return Changed; +} diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp index 1ea534939948..23812a2a2344 100644 --- a/lib/CodeGen/GlobalMerge.cpp +++ b/lib/CodeGen/GlobalMerge.cpp @@ -192,10 +192,7 @@ namespace { } // end anonymous namespace char GlobalMerge::ID = 0; -INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables", - false, false) -INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables", - false, false) +INITIALIZE_PASS(GlobalMerge, DEBUG_TYPE, "Merge global variables", false, false) bool GlobalMerge::doMerge(SmallVectorImpl &Globals, Module &M, bool isConst, unsigned AddrSpace) const { diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 628d599a3cc7..1c33f3b6800e 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -39,7 +39,7 @@ using namespace llvm; -#define DEBUG_TYPE "ifcvt" +#define DEBUG_TYPE "if-converter" // Hidden options for help debugging. 
static cl::opt IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden); @@ -316,9 +316,9 @@ namespace { char &llvm::IfConverterID = IfConverter::ID; -INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_BEGIN(IfConverter, DEBUG_TYPE, "If Converter", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) -INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false) +INITIALIZE_PASS_END(IfConverter, DEBUG_TYPE, "If Converter", false, false) bool IfConverter::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction()) || (PredicateFtor && !PredicateFtor(MF))) @@ -1588,32 +1588,22 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB); } - // To be able to insert code freely at the end of BBI we sometimes remove - // the branch from BBI to NextMBB temporarily. Remember if this happened. - bool RemovedBranchToNextMBB = false; if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); - // Keep the CFG updated. + // RemoveExtraEdges won't work if the block has an unanalyzable branch, so + // explicitly remove CvtBBI as a successor. BBI.BB->removeSuccessor(&CvtMBB, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB); PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); - // Remove the branch from the entry of the triangle to NextBB to be able to - // do the merge below. Keep the CFG updated, but remember we removed the - // branch since we do want to execute NextMBB, either by introducing a - // branch to it again, or merging it into the entry block. - // How it's handled is decided further down. - BBI.NonPredSize -= TII->removeBranch(*BBI.BB); - BBI.BB->removeSuccessor(&NextMBB, true); - RemovedBranchToNextMBB = true; - // Now merge the entry of the triangle with the true block. + BBI.NonPredSize -= TII->removeBranch(*BBI.BB); MergeBlocks(BBI, *CvtBBI, false); } @@ -1651,19 +1641,12 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { // block. By not merging them, we make it possible to iteratively // ifcvt the blocks. if (!HasEarlyExit && - // We might have removed BBI from NextMBB's predecessor list above but - // we want it to be there, so consider that too. - (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) && - !NextBBI->HasFallThrough && + NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough && !NextMBB.hasAddressTaken()) { - // We will merge NextBBI into BBI, and thus remove the current - // fallthrough from BBI into CvtBBI. - BBI.BB->removeSuccessor(&CvtMBB, true); MergeBlocks(BBI, *NextBBI); FalseBBDead = true; } else { InsertUncondBranch(*BBI.BB, NextMBB, TII); - BBI.BB->addSuccessor(&NextMBB); BBI.HasFallThrough = false; } // Mixed predicated and unpredicated code. This cannot be iteratively @@ -1671,6 +1654,8 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { IterIfcvt = false; } + RemoveExtraEdges(BBI); + // Update block info. BB can be iteratively if-converted. 
if (!IterIfcvt) BBI.IsDone = true; diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp index 920c2a372a9b..24e289dd4f1b 100644 --- a/lib/CodeGen/ImplicitNullChecks.cpp +++ b/lib/CodeGen/ImplicitNullChecks.cpp @@ -674,8 +674,8 @@ void ImplicitNullChecks::rewriteNullChecks( char ImplicitNullChecks::ID = 0; char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID; -INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_BEGIN(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks", +INITIALIZE_PASS_END(ImplicitNullChecks, DEBUG_TYPE, "Implicit null checks", false, false) diff --git a/lib/CodeGen/InterleavedAccessPass.cpp b/lib/CodeGen/InterleavedAccessPass.cpp index bb29db301a95..ee4929c91482 100644 --- a/lib/CodeGen/InterleavedAccessPass.cpp +++ b/lib/CodeGen/InterleavedAccessPass.cpp @@ -107,13 +107,11 @@ private: } // end anonymous namespace. char InterleavedAccess::ID = 0; -INITIALIZE_PASS_BEGIN( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_BEGIN(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_END( - InterleavedAccess, "interleaved-access", +INITIALIZE_PASS_END(InterleavedAccess, DEBUG_TYPE, "Lower interleaved memory accesses to target specific intrinsics", false, false) diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp index 275d84e2c185..40ee7ea785f0 100644 --- a/lib/CodeGen/LexicalScopes.cpp +++ b/lib/CodeGen/LexicalScopes.cpp @@ -86,8 +86,9 @@ void LexicalScopes::extractLexicalScopes( continue; } - // Ignore DBG_VALUE. It does not contribute to any instruction in output. - if (MInsn.isDebugValue()) + // Ignore DBG_VALUE and similar instruction that do not contribute to any + // instruction in the output. + if (MInsn.isMetaInstruction()) continue; if (RangeBeginMI) { diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp index f956974b1aaf..b5e705f6455d 100644 --- a/lib/CodeGen/LiveDebugValues.cpp +++ b/lib/CodeGen/LiveDebugValues.cpp @@ -43,7 +43,7 @@ using namespace llvm; -#define DEBUG_TYPE "live-debug-values" +#define DEBUG_TYPE "livedebugvalues" STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted"); @@ -283,7 +283,7 @@ public: char LiveDebugValues::ID = 0; char &llvm::LiveDebugValuesID = LiveDebugValues::ID; -INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis", +INITIALIZE_PASS(LiveDebugValues, DEBUG_TYPE, "Live DEBUG_VALUE analysis", false, false) /// Default construct and initialize the pass. 
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp index bcf7c8e99c7f..bbd783367c9e 100644 --- a/lib/CodeGen/LiveDebugVariables.cpp +++ b/lib/CodeGen/LiveDebugVariables.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "livedebug" +#define DEBUG_TYPE "livedebugvars" static cl::opt EnableLDV("live-debug-variables", cl::init(true), @@ -54,11 +54,11 @@ EnableLDV("live-debug-variables", cl::init(true), STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted"); char LiveDebugVariables::ID = 0; -INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars", +INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE, "Debug Variable Analysis", false, false) void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 3f5b8e19d1f0..0c05dbeacba0 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -1,4 +1,4 @@ -//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===// +//===- LiveIntervalAnalysis.cpp - Live Interval Analysis ------------------===// // // The LLVM Compiler Infrastructure // @@ -14,28 +14,45 @@ // //===----------------------------------------------------------------------===// -#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "LiveRangeCalc.h" -#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/CodeGen/LiveInterval.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBundle.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/IR/Value.h" +#include "llvm/MC/LaneBitmask.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Pass.h" #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetSubtargetInfo.h" #include -#include +#include +#include +#include +#include +#include + using namespace llvm; #define DEBUG_TYPE "regalloc" @@ -59,11 +76,13 @@ static bool EnablePrecomputePhysRegs = false; #endif // NDEBUG namespace llvm { + cl::opt UseSegmentSetForPhysRegs( "use-segment-set-for-physregs", cl::Hidden, cl::init(true), cl::desc( "Use segment set for the computation of the live ranges of physregs.")); -} + +} // end namespace llvm void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); @@ -78,8 +97,7 @@ void 
LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const { MachineFunctionPass::getAnalysisUsage(AU); } -LiveIntervals::LiveIntervals() : MachineFunctionPass(ID), - DomTree(nullptr), LRCalc(nullptr) { +LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) { initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); } @@ -168,12 +186,10 @@ LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const { #endif LiveInterval* LiveIntervals::createInterval(unsigned reg) { - float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? - llvm::huge_valf : 0.0F; + float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F; return new LiveInterval(reg, Weight); } - /// Compute the live interval of a virtual register, based on defs and uses. void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) { assert(LRCalc && "LRCalc not initialized."); @@ -337,7 +353,7 @@ static void createSegmentsForValues(LiveRange &LR, } } -typedef SmallVector, 16> ShrinkToUsesWorkList; +using ShrinkToUsesWorkList = SmallVector, 16>; static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes, ShrinkToUsesWorkList &WorkList, @@ -593,7 +609,7 @@ void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill, // Find all blocks that are reachable from KillMBB without leaving VNI's live // range. It is possible that KillMBB itself is reachable, so start a DFS // from each successor. - typedef df_iterator_default_set VisitedTy; + using VisitedTy = df_iterator_default_set; VisitedTy Visited; for (MachineBasicBlock *Succ : KillMBB->successors()) { for (df_ext_iterator @@ -822,7 +838,6 @@ LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) { return S; } - //===----------------------------------------------------------------------===// // Register mask functions //===----------------------------------------------------------------------===// @@ -855,7 +870,7 @@ bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI, return false; bool Found = false; - for (;;) { + while (true) { assert(*SlotI >= LiveI->start); // Loop over all slots overlapping this segment. while (*SlotI < LiveI->end) { diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp index 9f7d7cf54848..0dc1079b2ad4 100644 --- a/lib/CodeGen/LivePhysRegs.cpp +++ b/lib/CodeGen/LivePhysRegs.cpp @@ -53,7 +53,7 @@ void LivePhysRegs::stepBackward(const MachineInstr &MI) { continue; removeReg(Reg); } else if (O->isRegMask()) - removeRegsInMask(*O, nullptr); + removeRegsInMask(*O); } // Add uses to the set. @@ -142,66 +142,85 @@ bool LivePhysRegs::available(const MachineRegisterInfo &MRI, /// Add live-in registers of basic block \p MBB to \p LiveRegs. void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) { for (const auto &LI : MBB.liveins()) { - MCSubRegIndexIterator S(LI.PhysReg, TRI); - if (LI.LaneMask.all() || (LI.LaneMask.any() && !S.isValid())) { - addReg(LI.PhysReg); + unsigned Reg = LI.PhysReg; + LaneBitmask Mask = LI.LaneMask; + MCSubRegIndexIterator S(Reg, TRI); + assert(Mask.any() && "Invalid livein mask"); + if (Mask.all() || !S.isValid()) { + addReg(Reg); continue; } for (; S.isValid(); ++S) { unsigned SI = S.getSubRegIndex(); - if ((LI.LaneMask & TRI->getSubRegIndexLaneMask(SI)).any()) + if ((Mask & TRI->getSubRegIndexLaneMask(SI)).any()) addReg(S.getSubReg()); } } } -/// Add pristine registers to the given \p LiveRegs. This function removes -/// actually saved callee save registers when \p InPrologueEpilogue is false. 
-static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF, - const MachineFrameInfo &MFI, - const TargetRegisterInfo &TRI) { +/// Adds all callee saved registers to \p LiveRegs. +static void addCalleeSavedRegs(LivePhysRegs &LiveRegs, + const MachineFunction &MF) { const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; - ++CSR) + for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR; ++CSR) LiveRegs.addReg(*CSR); +} + +/// Adds pristine registers to the given \p LiveRegs. Pristine registers are +/// callee saved registers that are unused in the function. +static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF) { + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (!MFI.isCalleeSavedInfoValid()) + return; + /// Add all callee saved regs, then remove the ones that are saved+restored. + addCalleeSavedRegs(LiveRegs, MF); + /// Remove the ones that are not saved/restored; they are pristine. for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) LiveRegs.removeReg(Info.getReg()); } void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) { - // To get the live-outs we simply merge the live-ins of all successors. - for (const MachineBasicBlock *Succ : MBB.successors()) - addBlockLiveIns(*Succ); + if (!MBB.succ_empty()) { + // To get the live-outs we simply merge the live-ins of all successors. + for (const MachineBasicBlock *Succ : MBB.successors()) + addBlockLiveIns(*Succ); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers that are saved and + // restored (somewhere); This does not include callee saved registers that + // are unused and hence not saved and restored; they are called pristine. + const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) + addReg(Info.getReg()); + } + } } void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) { - const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) { - if (MBB.isReturnBlock()) { - // The return block has no successors whose live-ins we could merge - // below. So instead we add the callee saved registers manually. - const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (const MCPhysReg *I = MRI.getCalleeSavedRegs(); *I; ++I) - addReg(*I); - } else { - addPristines(*this, MF, MFI, *TRI); - } + if (!MBB.succ_empty()) { + const MachineFunction &MF = *MBB.getParent(); + addPristines(*this, MF); + addLiveOutsNoPristines(MBB); + } else if (MBB.isReturnBlock()) { + // For the return block: Add all callee saved registers. 
+ const MachineFunction &MF = *MBB.getParent(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + if (MFI.isCalleeSavedInfoValid()) + addCalleeSavedRegs(*this, MF); } - - addLiveOutsNoPristines(MBB); } void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - if (MFI.isCalleeSavedInfoValid()) - addPristines(*this, MF, MFI, *TRI); + addPristines(*this, MF); addBlockLiveIns(MBB); } -void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, +void llvm::computeLiveIns(LivePhysRegs &LiveRegs, + const MachineRegisterInfo &MRI, MachineBasicBlock &MBB) { + const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); assert(MBB.livein_empty()); LiveRegs.init(TRI); LiveRegs.addLiveOutsNoPristines(MBB); @@ -209,10 +228,12 @@ void llvm::computeLiveIns(LivePhysRegs &LiveRegs, const TargetRegisterInfo &TRI, LiveRegs.stepBackward(MI); for (unsigned Reg : LiveRegs) { + if (MRI.isReserved(Reg)) + continue; // Skip the register if we are about to add one of its super registers. bool ContainsSuperReg = false; for (MCSuperRegIterator SReg(Reg, &TRI); SReg.isValid(); ++SReg) { - if (LiveRegs.contains(*SReg)) { + if (LiveRegs.contains(*SReg) && !MRI.isReserved(*SReg)) { ContainsSuperReg = true; break; } diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp index dbf1f96102d1..b51f8b0aa6bb 100644 --- a/lib/CodeGen/LiveStackAnalysis.cpp +++ b/lib/CodeGen/LiveStackAnalysis.cpp @@ -25,10 +25,10 @@ using namespace llvm; #define DEBUG_TYPE "livestacks" char LiveStacks::ID = 0; -INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks", +INITIALIZE_PASS_BEGIN(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) -INITIALIZE_PASS_END(LiveStacks, "livestacks", +INITIALIZE_PASS_END(LiveStacks, DEBUG_TYPE, "Live Stack Slot Analysis", false, false) char &llvm::LiveStacksID = LiveStacks::ID; diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp index e189fb0dd89d..17cab0ae910e 100644 --- a/lib/CodeGen/LocalStackSlotAllocation.cpp +++ b/lib/CodeGen/LocalStackSlotAllocation.cpp @@ -103,10 +103,10 @@ namespace { char LocalStackSlotPass::ID = 0; char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID; -INITIALIZE_PASS_BEGIN(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_BEGIN(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc", +INITIALIZE_PASS_END(LocalStackSlotPass, DEBUG_TYPE, "Local Stack Slot Allocation", false, false) diff --git a/lib/CodeGen/LowerEmuTLS.cpp b/lib/CodeGen/LowerEmuTLS.cpp index 5fb5b747f471..0fc48d4e0b6b 100644 --- a/lib/CodeGen/LowerEmuTLS.cpp +++ b/lib/CodeGen/LowerEmuTLS.cpp @@ -53,7 +53,7 @@ private: char LowerEmuTLS::ID = 0; -INITIALIZE_PASS(LowerEmuTLS, "loweremutls", +INITIALIZE_PASS(LowerEmuTLS, DEBUG_TYPE, "Add __emutls_[vt]. 
variables for emultated TLS model", false, false) diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp index 9c7367b4c780..4d1ec11df46c 100644 --- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp +++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp @@ -26,7 +26,7 @@ using namespace llvm; -#define DEBUG_TYPE "block-freq" +#define DEBUG_TYPE "machine-block-freq" static cl::opt ViewMachineBlockFreqPropagationDAG( @@ -149,11 +149,11 @@ struct DOTGraphTraits } // end namespace llvm -INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq", +INITIALIZE_PASS_END(MachineBlockFrequencyInfo, DEBUG_TYPE, "Machine Block Frequency Analysis", true, true) char MachineBlockFrequencyInfo::ID = 0; diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index adfca9a46239..c1ca8e8e83b4 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -499,13 +499,13 @@ public: char MachineBlockPlacement::ID = 0; char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID; -INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_BEGIN(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement", +INITIALIZE_PASS_END(MachineBlockPlacement, DEBUG_TYPE, "Branch Probability Basic Block Placement", false, false) #ifndef NDEBUG diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index 0766f465456c..34f6bbd59e9b 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -108,12 +108,12 @@ namespace { char MachineCSE::ID = 0; char &llvm::MachineCSEID = MachineCSE::ID; -INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse", - "Machine Common Subexpression Elimination", false, false) +INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE, + "Machine Common Subexpression Elimination", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineCSE, "machine-cse", - "Machine Common Subexpression Elimination", false, false) +INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE, + "Machine Common Subexpression Elimination", false, false) /// The source register of a COPY machine instruction can be propagated to all /// its users, and this propagation could increase the probability of finding @@ -180,8 +180,8 @@ MachineCSE::isPhysDefTriviallyDead(unsigned Reg, I = skipDebugInstructionsForward(I, E); if (I == E) - // Reached end of block, register is obviously dead. - return true; + // Reached end of block, we don't know if register is dead or not. 
+ return false; bool SeenDef = false; for (const MachineOperand &MO : I->operands()) { diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp index 50e453e4067c..c176de16b593 100644 --- a/lib/CodeGen/MachineCombiner.cpp +++ b/lib/CodeGen/MachineCombiner.cpp @@ -86,11 +86,11 @@ private: char MachineCombiner::ID = 0; char &llvm::MachineCombinerID = MachineCombiner::ID; -INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner", +INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) -INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner", +INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp index 7312dc5e94bd..f83b5481e0a5 100644 --- a/lib/CodeGen/MachineCopyPropagation.cpp +++ b/lib/CodeGen/MachineCopyPropagation.cpp @@ -27,7 +27,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "codegen-cp" +#define DEBUG_TYPE "machine-cp" STATISTIC(NumDeletes, "Number of dead copies deleted"); @@ -79,7 +79,7 @@ namespace { char MachineCopyPropagation::ID = 0; char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID; -INITIALIZE_PASS(MachineCopyPropagation, "machine-cp", +INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE, "Machine Copy Propagation Pass", false, false) /// Remove any entry in \p Map where the register is a subregister or equal to diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 7eb991744f01..95c62d820b0e 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -38,7 +38,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "machine-licm" +#define DEBUG_TYPE "machinelicm" static cl::opt AvoidSpeculation("avoid-speculation", @@ -237,13 +237,13 @@ namespace { char MachineLICM::ID = 0; char &llvm::MachineLICMID = MachineLICM::ID; -INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineLICM, "machinelicm", - "Machine Loop Invariant Code Motion", false, false) +INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE, + "Machine Loop Invariant Code Motion", false, false) /// Test if the given loop is the outer-most loop that has a unique predecessor. 
static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) { diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp index 581a8ad81149..9ea3c00a2fc4 100644 --- a/lib/CodeGen/MachineOutliner.cpp +++ b/lib/CodeGen/MachineOutliner.cpp @@ -901,7 +901,7 @@ namespace llvm { ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); } } -INITIALIZE_PASS(MachineOutliner, "machine-outliner", +INITIALIZE_PASS(MachineOutliner, DEBUG_TYPE, "Machine Function Outliner", false, false) void MachineOutliner::pruneOverlaps(std::vector &CandidateList, diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp index d06c38cf4ed8..8f5ac8b3fc45 100644 --- a/lib/CodeGen/MachinePipeliner.cpp +++ b/lib/CodeGen/MachinePipeliner.cpp @@ -715,13 +715,13 @@ char MachinePipeliner::ID = 0; int MachinePipeliner::NumTries = 0; #endif char &llvm::MachinePipelinerID = MachinePipeliner::ID; -INITIALIZE_PASS_BEGIN(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachinePipeliner, "pipeliner", +INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE, "Modulo Software Pipelining", false, false) /// The "main" function for implementing Swing Modulo Scheduling. diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 41e161f71e53..edc3783afa2f 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -69,7 +69,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace llvm { @@ -191,13 +191,13 @@ char MachineScheduler::ID = 0; char &llvm::MachineSchedulerID = MachineScheduler::ID; -INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler", +INITIALIZE_PASS_END(MachineScheduler, DEBUG_TYPE, "Machine Instruction Scheduler", false, false) MachineScheduler::MachineScheduler() @@ -532,7 +532,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler, // thumb2 size reduction is currently an exception, so the PostMIScheduler // needs to do this. if (FixKillFlags) - Scheduler.fixupKills(&*MBB); + Scheduler.fixupKills(*MBB); } Scheduler.finalizeSchedule(); } @@ -3233,6 +3233,12 @@ void PostGenericScheduler::tryCandidate(SchedCandidate &Cand, Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall)) return; + // Keep clustered nodes together. + if (tryGreater(TryCand.SU == DAG->getNextClusterSucc(), + Cand.SU == DAG->getNextClusterSucc(), + TryCand, Cand, Cluster)) + return; + // Avoid critical resource consumption and balance the schedule. 
if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources, TryCand, Cand, ResourceReduce)) diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 5f87b68123f1..7c34e71a0cce 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -173,14 +173,14 @@ namespace { char MachineSinking::ID = 0; char &llvm::MachineSinkingID = MachineSinking::ID; -INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(MachineSinking, "machine-sink", - "Machine code sinking", false, false) +INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE, + "Machine code sinking", false, false) bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI, MachineBasicBlock *MBB) { diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp index 998a9645e68b..01391a1a0e50 100644 --- a/lib/CodeGen/MachineTraceMetrics.cpp +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -44,12 +44,12 @@ using namespace llvm; char MachineTraceMetrics::ID = 0; char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(MachineTraceMetrics, - "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, + "Machine Trace Metrics", false, true) MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index b53b002f55a6..265f93c363ca 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -87,7 +87,6 @@ namespace { RegSet regsLive; RegVector regsDefined, regsDead, regsKilled; RegMaskVector regMasks; - RegSet regsLiveInButUnused; SlotIndex lastIndex; @@ -419,7 +418,6 @@ unsigned MachineVerifier::verify(MachineFunction &MF) { regsDead.clear(); regsKilled.clear(); regMasks.clear(); - regsLiveInButUnused.clear(); MBBInfoMap.clear(); return foundErrors; @@ -756,7 +754,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { regsLive.insert(*SubRegs); } } - regsLiveInButUnused = regsLive; const MachineFrameInfo &MFI = MF->getFrameInfo(); BitVector PR = MFI.getPristineRegs(*MF); @@ -1268,8 +1265,6 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Both use and def operands can read a register. 
if (MO->readsReg()) { - regsLiveInButUnused.erase(Reg); - if (MO->isKill()) addRegWithSubRegs(regsKilled, Reg); diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp index 2a8531f337a0..76ad668104b4 100644 --- a/lib/CodeGen/OptimizePHIs.cpp +++ b/lib/CodeGen/OptimizePHIs.cpp @@ -23,7 +23,7 @@ #include "llvm/Target/TargetSubtargetInfo.h" using namespace llvm; -#define DEBUG_TYPE "phi-opt" +#define DEBUG_TYPE "opt-phis" STATISTIC(NumPHICycles, "Number of PHI cycles replaced"); STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles"); @@ -59,7 +59,7 @@ namespace { char OptimizePHIs::ID = 0; char &llvm::OptimizePHIsID = OptimizePHIs::ID; -INITIALIZE_PASS(OptimizePHIs, "opt-phis", +INITIALIZE_PASS(OptimizePHIs, DEBUG_TYPE, "Optimize machine instruction PHIs", false, false) bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) { diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index db2264b2439d..9c898fa40d7e 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -112,11 +112,11 @@ STATISTIC(NumReused, "Number of reused lowered phis"); char PHIElimination::ID = 0; char& llvm::PHIEliminationID = PHIElimination::ID; -INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_BEGIN(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) INITIALIZE_PASS_DEPENDENCY(LiveVariables) -INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination", +INITIALIZE_PASS_END(PHIElimination, DEBUG_TYPE, "Eliminate PHI nodes for register allocation", false, false) void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index 61dccdde8f1d..f2249f9e37e0 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -200,7 +200,7 @@ namespace { char &llvm::PostRASchedulerID = PostRAScheduler::ID; -INITIALIZE_PASS(PostRAScheduler, "post-RA-sched", +INITIALIZE_PASS(PostRAScheduler, DEBUG_TYPE, "Post RA top-down list latency scheduler", false, false) SchedulePostRATDList::SchedulePostRATDList( @@ -367,7 +367,7 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { Scheduler.finishBlock(); // Update register kills - Scheduler.fixupKills(&MBB); + Scheduler.fixupKills(MBB); } return true; diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index d27ea2f51867..0118580a626a 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -20,7 +20,7 @@ using namespace llvm; -#define DEBUG_TYPE "processimplicitdefs" +#define DEBUG_TYPE "processimpdefs" namespace { /// Process IMPLICIT_DEF instructions and make sure there is one implicit_def @@ -51,9 +51,7 @@ public: char ProcessImplicitDefs::ID = 0; char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID; -INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs", - "Process Implicit Definitions", false, false) -INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs", +INITIALIZE_PASS(ProcessImplicitDefs, DEBUG_TYPE, "Process Implicit Definitions", false, false) void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const { diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp index aaa253fde494..a9813e534c5f 100644 --- a/lib/CodeGen/PrologEpilogInserter.cpp +++ b/lib/CodeGen/PrologEpilogInserter.cpp @@ -45,7 +45,7 @@ using namespace llvm; -#define DEBUG_TYPE "pei" +#define DEBUG_TYPE 
"prologepilog" typedef SmallVector MBBVector; static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS, @@ -129,12 +129,12 @@ WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1), cl::desc("Warn for stack size bigger than the given" " number")); -INITIALIZE_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion", false, +INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(PEI, "prologepilog", +INITIALIZE_PASS_END(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion & Frame Finalization", false, false) @@ -450,12 +450,13 @@ static void updateLiveness(MachineFunction &MF) { const std::vector &CSI = MFI.getCalleeSavedInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); for (unsigned i = 0, e = CSI.size(); i != e; ++i) { for (MachineBasicBlock *MBB : Visited) { MCPhysReg Reg = CSI[i].getReg(); // Add the callee-saved register as live-in. // It's killed at the spill. - if (!MBB->isLiveIn(Reg)) + if (!MRI.isReserved(Reg) && !MBB->isLiveIn(Reg)) MBB->addLiveIn(Reg); } } diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp index 2f7ee8bf414c..cc32e43968bb 100644 --- a/lib/CodeGen/RenameIndependentSubregs.cpp +++ b/lib/CodeGen/RenameIndependentSubregs.cpp @@ -112,11 +112,11 @@ char RenameIndependentSubregs::ID; char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID; -INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs", +INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, DEBUG_TYPE, "Rename Independent Subregisters", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs", +INITIALIZE_PASS_END(RenameIndependentSubregs, DEBUG_TYPE, "Rename Independent Subregisters", false, false) bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const { diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 2771fdbd737a..8584a9b7c897 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -52,7 +52,7 @@ using namespace llvm; using namespace llvm::safestack; -#define DEBUG_TYPE "safestack" +#define DEBUG_TYPE "safe-stack" namespace llvm { @@ -820,10 +820,10 @@ public: } // anonymous namespace char SafeStackLegacyPass::ID = 0; -INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, "safe-stack", +INITIALIZE_PASS_BEGIN(SafeStackLegacyPass, DEBUG_TYPE, "Safe Stack instrumentation pass", false, false) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(SafeStackLegacyPass, "safe-stack", +INITIALIZE_PASS_END(SafeStackLegacyPass, DEBUG_TYPE, "Safe Stack instrumentation pass", false, false) FunctionPass *llvm::createSafeStackPass() { return new SafeStackLegacyPass(); } diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp index dab5b91f50ad..07b43a82ca99 100644 --- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp +++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -49,12 +49,8 @@ private: } // namespace char ScalarizeMaskedMemIntrin::ID = 0; -INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin", - "Scalarize unsupported masked memory intrinsics", false, - false) -INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin", - "Scalarize unsupported masked memory intrinsics", false, - 
false) +INITIALIZE_PASS(ScalarizeMaskedMemIntrin, DEBUG_TYPE, + "Scalarize unsupported masked memory intrinsics", false, false) FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() { return new ScalarizeMaskedMemIntrin(); diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 18823b74c47f..8035ea80364b 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -1057,179 +1057,71 @@ void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores, loads.dump()); } -void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) { - // Start with no live registers. - LiveRegs.reset(); +static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs, + MachineInstr &MI, bool addToLiveRegs) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.readsReg()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; - // Examine the live-in regs of all successors. - for (const MachineBasicBlock *Succ : BB->successors()) { - for (const auto &LI : Succ->liveins()) { - // Repeat, for reg and all subregs. - for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); - } + // Things that are available after the instruction are killed by it. + bool IsKill = LiveRegs.available(MRI, Reg); + MO.setIsKill(IsKill); + if (IsKill && addToLiveRegs) + LiveRegs.addReg(Reg); } } -/// \brief If we change a kill flag on the bundle instruction implicit register -/// operands, then we also need to propagate that to any instructions inside -/// the bundle which had the same kill state. -static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg, - bool NewKillState, - const TargetRegisterInfo *TRI) { - if (MI->getOpcode() != TargetOpcode::BUNDLE) - return; +void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) { + DEBUG(dbgs() << "Fixup kills for BB#" << MBB.getNumber() << '\n'); - // Walk backwards from the last instruction in the bundle to the first. - // Once we set a kill flag on an instruction, we bail out, as otherwise we - // might set it on too many operands. We will clear as many flags as we - // can though. - MachineBasicBlock::instr_iterator Begin = MI->getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (Begin != End) { - if (NewKillState) { - if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false)) - return; - } else - (--End)->clearRegisterKills(Reg, TRI); - } -} - -void ScheduleDAGInstrs::toggleKillFlag(MachineInstr &MI, MachineOperand &MO) { - if (MO.isDebug()) - return; - - // Setting kill flag... - if (!MO.isKill()) { - MO.setIsKill(true); - toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); - return; - } - - // If MO itself is live, clear the kill flag... - if (LiveRegs.test(MO.getReg())) { - MO.setIsKill(false); - toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); - return; - } - - // If any subreg of MO is live, then create an imp-def for that - // subreg and keep MO marked as killed. 
- MO.setIsKill(false); - toggleBundleKillFlag(&MI, MO.getReg(), false, TRI); - bool AllDead = true; - const unsigned SuperReg = MO.getReg(); - MachineInstrBuilder MIB(MF, &MI); - for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - MIB.addReg(*SubRegs, RegState::ImplicitDefine); - AllDead = false; - } - } - - if(AllDead) { - MO.setIsKill(true); - toggleBundleKillFlag(&MI, MO.getReg(), true, TRI); - } -} - -void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) { - // FIXME: Reuse the LivePhysRegs utility for this. - DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n'); - - LiveRegs.resize(TRI->getNumRegs()); - BitVector killedRegs(TRI->getNumRegs()); - - startBlockForKills(MBB); + LiveRegs.init(*TRI); + LiveRegs.addLiveOuts(MBB); // Examine block from end to start... - unsigned Count = MBB->size(); - for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin(); - I != E; --Count) { - MachineInstr &MI = *--I; + for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) { if (MI.isDebugValue()) continue; // Update liveness. Registers that are defed but not used in this // instruction are now dead. Mark register and all subregs as they // are completely defined. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (MO.isRegMask()) - LiveRegs.clearBitsNotInMask(MO.getRegMask()); - if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); - if (Reg == 0) continue; - if (!MO.isDef()) continue; - // Ignore two-addr defs. - if (MI.isRegTiedToUseOperand(i)) continue; - - // Repeat for reg and all subregs. - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.reset(*SubRegs); + for (ConstMIBundleOperands O(MI); O.isValid(); ++O) { + const MachineOperand &MO = *O; + if (MO.isReg()) { + if (!MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + if (!Reg) + continue; + LiveRegs.removeReg(Reg); + } else if (MO.isRegMask()) { + LiveRegs.removeRegsInMask(MO); + } } - // Examine all used registers and set/clear kill flag. When a - // register is used multiple times we only set the kill flag on - // the first use. Don't set kill flags on undef operands. - killedRegs.reset(); - - // toggleKillFlag can append new operands (implicit defs), so using - // a range-based loop is not safe. The new operands will be appended - // at the end of the operand list and they don't need to be visited, - // so iterating until the currently last operand is ok. - for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI.getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - bool kill = false; - if (!killedRegs.test(Reg)) { - kill = true; - // A register is not killed if any subregs are live... - for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) { - if (LiveRegs.test(*SubRegs)) { - kill = false; - break; - } - } - - // If subreg is not live, then register is killed if it became - // live in this instruction - if (kill) - kill = !LiveRegs.test(Reg); + // If there is a bundle header fix it up first. 
+ if (!MI.isBundled()) { + toggleKills(MRI, LiveRegs, MI, true); + } else { + MachineBasicBlock::instr_iterator First = MI.getIterator(); + if (MI.isBundle()) { + toggleKills(MRI, LiveRegs, MI, false); + ++First; } - - if (MO.isKill() != kill) { - DEBUG(dbgs() << "Fixing " << MO << " in "); - toggleKillFlag(MI, MO); - DEBUG(MI.dump()); - DEBUG({ - if (MI.getOpcode() == TargetOpcode::BUNDLE) { - MachineBasicBlock::instr_iterator Begin = MI.getIterator(); - MachineBasicBlock::instr_iterator End = getBundleEnd(Begin); - while (++Begin != End) - DEBUG(Begin->dump()); - } - }); - } - - killedRegs.set(Reg); - } - - // Mark any used register (that is not using undef) and subregs as - // now live... - for (const MachineOperand &MO : MI.operands()) { - if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue; - unsigned Reg = MO.getReg(); - if ((Reg == 0) || MRI.isReserved(Reg)) continue; - - for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true); - SubRegs.isValid(); ++SubRegs) - LiveRegs.set(*SubRegs); + // Some targets make the (questionable) assumtion that the instructions + // inside the bundle are ordered and consequently only the last use of + // a register inside the bundle can kill it. + MachineBasicBlock::instr_iterator I = std::next(First); + while (I->isBundledWithSucc()) + ++I; + do { + if (!I->isDebugValue()) + toggleKills(MRI, LiveRegs, *I, true); + --I; + } while(I != First); } } } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 5d450e7e078c..23a302f3e561 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12349,9 +12349,9 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts( SDValue Val = St->getValue(); StoreInt <<= ElementSizeBytes * 8; if (ConstantSDNode *C = dyn_cast(Val)) { - StoreInt |= C->getAPIntValue().zext(SizeInBits); + StoreInt |= C->getAPIntValue().zextOrTrunc(SizeInBits); } else if (ConstantFPSDNode *C = dyn_cast(Val)) { - StoreInt |= C->getValueAPF().bitcastToAPInt().zext(SizeInBits); + StoreInt |= C->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits); } else { llvm_unreachable("Invalid constant element type"); } @@ -12617,16 +12617,19 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits); bool IsFast = false; if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { LastLegalType = i + 1; // Or check whether a truncstore is legal. - } else if (TLI.getTypeAction(Context, StoreTy) == + } else if (!LegalTypes && + TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoredVal.getValueType()); if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) { @@ -12642,7 +12645,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { !NoVectors) { // Find a legal type for the vector store. 
EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1); - if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) && + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) @@ -12700,7 +12703,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts); bool IsFast; - if (TLI.isTypeLegal(Ty) && + if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty) && TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS, FirstStoreAlign, &IsFast) && IsFast) @@ -12810,6 +12813,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { EVT StoreTy = EVT::getVectorVT(Context, MemVT, i + 1); bool IsFastSt, IsFastLd; if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && @@ -12823,6 +12827,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8; StoreTy = EVT::getIntegerVT(Context, SizeInBits); if (TLI.isTypeLegal(StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, StoreTy) && TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS, FirstStoreAlign, &IsFastSt) && IsFastSt && @@ -12834,7 +12839,9 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) { else if (TLI.getTypeAction(Context, StoreTy) == TargetLowering::TypePromoteInteger) { EVT LegalizedStoredValueTy = TLI.getTypeToTransformTo(Context, StoreTy); - if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + if (!LegalTypes && + TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) && + TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValueTy) && TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) && TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, @@ -14455,6 +14462,145 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { return SDValue(); } +/// If we are extracting a subvector produced by a wide binary operator with at +/// at least one operand that was the result of a vector concatenation, then try +/// to use the narrow vector operands directly to avoid the concatenation and +/// extraction. +static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Refactor with the caller (visitEXTRACT_SUBVECTOR), so we can share + // some of these bailouts with other transforms. + + // The extract index must be a constant, so we can map it to a concat operand. + auto *ExtractIndex = dyn_cast(Extract->getOperand(1)); + if (!ExtractIndex) + return SDValue(); + + // Only handle the case where we are doubling and then halving. A larger ratio + // may require more than two narrow binops to replace the wide binop. + EVT VT = Extract->getValueType(0); + unsigned NumElems = VT.getVectorNumElements(); + assert((ExtractIndex->getZExtValue() % NumElems) == 0 && + "Extract index is not a multiple of the vector length."); + if (Extract->getOperand(0).getValueSizeInBits() != VT.getSizeInBits() * 2) + return SDValue(); + + // We are looking for an optionally bitcasted wide vector binary operator + // feeding an extract subvector. + SDValue BinOp = Extract->getOperand(0); + if (BinOp.getOpcode() == ISD::BITCAST) + BinOp = BinOp.getOperand(0); + + // TODO: The motivating case for this transform is an x86 AVX1 target. 
That + // target has temptingly almost legal versions of bitwise logic ops in 256-bit + // flavors, but no other 256-bit integer support. This could be extended to + // handle any binop, but that may require fixing/adding other folds to avoid + // codegen regressions. + unsigned BOpcode = BinOp.getOpcode(); + if (BOpcode != ISD::AND && BOpcode != ISD::OR && BOpcode != ISD::XOR) + return SDValue(); + + // The binop must be a vector type, so we can chop it in half. + EVT WideBVT = BinOp.getValueType(); + if (!WideBVT.isVector()) + return SDValue(); + + // Bail out if the target does not support a narrower version of the binop. + EVT NarrowBVT = EVT::getVectorVT(*DAG.getContext(), WideBVT.getScalarType(), + WideBVT.getVectorNumElements() / 2); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isOperationLegalOrCustomOrPromote(BOpcode, NarrowBVT)) + return SDValue(); + + // Peek through bitcasts of the binary operator operands if needed. + SDValue LHS = BinOp.getOperand(0); + if (LHS.getOpcode() == ISD::BITCAST) + LHS = LHS.getOperand(0); + + SDValue RHS = BinOp.getOperand(1); + if (RHS.getOpcode() == ISD::BITCAST) + RHS = RHS.getOperand(0); + + // We need at least one concatenation operation of a binop operand to make + // this transform worthwhile. The concat must double the input vector sizes. + // TODO: Should we also handle INSERT_SUBVECTOR patterns? + bool ConcatL = + LHS.getOpcode() == ISD::CONCAT_VECTORS && LHS.getNumOperands() == 2; + bool ConcatR = + RHS.getOpcode() == ISD::CONCAT_VECTORS && RHS.getNumOperands() == 2; + if (!ConcatL && !ConcatR) + return SDValue(); + + // If one of the binop operands was not the result of a concat, we must + // extract a half-sized operand for our new narrow binop. We can't just reuse + // the original extract index operand because we may have bitcasted. + unsigned ConcatOpNum = ExtractIndex->getZExtValue() / NumElems; + unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements(); + EVT ExtBOIdxVT = Extract->getOperand(1).getValueType(); + SDLoc DL(Extract); + + // extract (binop (concat X1, X2), (concat Y1, Y2)), N --> binop XN, YN + // extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, N) + // extract (binop X, (concat Y1, Y2)), N --> binop (extract X, N), YN + SDValue X = ConcatL ? DAG.getBitcast(NarrowBVT, LHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(0), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue Y = ConcatR ? DAG.getBitcast(NarrowBVT, RHS.getOperand(ConcatOpNum)) + : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT, + BinOp.getOperand(1), + DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT)); + + SDValue NarrowBinOp = DAG.getNode(BOpcode, DL, NarrowBVT, X, Y); + return DAG.getBitcast(VT, NarrowBinOp); +} + +/// If we are extracting a subvector from a wide vector load, convert to a +/// narrow load to eliminate the extraction: +/// (extract_subvector (load wide vector)) --> (load narrow vector) +static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) { + // TODO: Add support for big-endian. The offset calculation must be adjusted. + if (DAG.getDataLayout().isBigEndian()) + return SDValue(); + + // TODO: The one-use check is overly conservative. Check the cost of the + // extract instead or remove that condition entirely. 
+ auto *Ld = dyn_cast(Extract->getOperand(0)); + auto *ExtIdx = dyn_cast(Extract->getOperand(1)); + if (!Ld || !Ld->hasOneUse() || Ld->isVolatile() || !ExtIdx) + return SDValue(); + + // The narrow load will be offset from the base address of the old load if + // we are extracting from something besides index 0 (little-endian). + EVT VT = Extract->getValueType(0); + SDLoc DL(Extract); + SDValue BaseAddr = Ld->getOperand(1); + unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize(); + + // TODO: Use "BaseIndexOffset" to make this more effective. + SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL); + MachineFunction &MF = DAG.getMachineFunction(); + MachineMemOperand *MMO = MF.getMachineMemOperand(Ld->getMemOperand(), Offset, + VT.getStoreSize()); + SDValue NewLd = DAG.getLoad(VT, DL, Ld->getChain(), NewAddr, MMO); + + // The new load must have the same position as the old load in terms of memory + // dependency. Create a TokenFactor for Ld and NewLd and update uses of Ld's + // output chain to use that TokenFactor. + // TODO: This code is based on a similar sequence in x86 lowering. It should + // be moved to a helper function, so it can be shared and reused. + if (Ld->hasAnyUseOfValue(1)) { + SDValue OldChain = SDValue(Ld, 1); + SDValue NewChain = SDValue(NewLd.getNode(), 1); + SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, + OldChain, NewChain); + DAG.ReplaceAllUsesOfValueWith(OldChain, TokenFactor); + DAG.UpdateNodeOperands(TokenFactor.getNode(), OldChain, NewChain); + } + + return NewLd; +} + SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { EVT NVT = N->getValueType(0); SDValue V = N->getOperand(0); @@ -14463,6 +14609,10 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { if (V.isUndef()) return DAG.getUNDEF(NVT); + if (TLI.isOperationLegalOrCustomOrPromote(ISD::LOAD, NVT)) + if (SDValue NarrowLoad = narrowExtractedVectorLoad(N, DAG)) + return NarrowLoad; + // Combine: // (extract_subvec (concat V1, V2, ...), i) // Into: @@ -14510,6 +14660,9 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) { } } + if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG)) + return NarrowBOp; + return SDValue(); } @@ -14745,10 +14898,10 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN, // This is often generated during legalization. // e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)) // TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case. -SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, - SelectionDAG &DAG, - const TargetLowering &TLI, - bool LegalOperations) { +static SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG, + const TargetLowering &TLI, + bool LegalOperations) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); @@ -14795,7 +14948,8 @@ SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN, // destination type. This is often generated during legalization. // If the source node itself was a '*_extend_vector_inreg' node then we should // then be able to remove it. 
-SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { +static SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, + SelectionDAG &DAG) { EVT VT = SVN->getValueType(0); bool IsBigEndian = DAG.getDataLayout().isBigEndian(); diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 9a47a914df91..d0a8b34c69c6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -899,6 +899,39 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { } } +static TargetLowering::LegalizeAction +getStrictFPOpcodeAction(const TargetLowering &TLI, unsigned Opcode, EVT VT) { + unsigned EqOpc; + switch (Opcode) { + default: llvm_unreachable("Unexpected FP pseudo-opcode"); + case ISD::STRICT_FSQRT: EqOpc = ISD::FSQRT; break; + case ISD::STRICT_FPOW: EqOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: EqOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: EqOpc = ISD::FSIN; break; + case ISD::STRICT_FCOS: EqOpc = ISD::FCOS; break; + case ISD::STRICT_FEXP: EqOpc = ISD::FEXP; break; + case ISD::STRICT_FEXP2: EqOpc = ISD::FEXP2; break; + case ISD::STRICT_FLOG: EqOpc = ISD::FLOG; break; + case ISD::STRICT_FLOG10: EqOpc = ISD::FLOG10; break; + case ISD::STRICT_FLOG2: EqOpc = ISD::FLOG2; break; + case ISD::STRICT_FRINT: EqOpc = ISD::FRINT; break; + case ISD::STRICT_FNEARBYINT: EqOpc = ISD::FNEARBYINT; break; + } + + auto Action = TLI.getOperationAction(EqOpc, VT); + + // We don't currently handle Custom or Promote for strict FP pseudo-ops. + // For now, we just expand for those cases. + if (Action != TargetLowering::Legal) + Action = TargetLowering::Expand; + + // ISD::FPOWI returns 'Legal' even though it should be expanded. + if (Opcode == ISD::STRICT_FPOWI && Action == TargetLowering::Legal) + Action = TargetLowering::Expand; + + return Action; +} + /// Return a legal replacement for the given operation, with all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG)); @@ -1043,6 +1076,25 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { return; } break; + case ISD::STRICT_FSQRT: + case ISD::STRICT_FPOW: + case ISD::STRICT_FPOWI: + case ISD::STRICT_FSIN: + case ISD::STRICT_FCOS: + case ISD::STRICT_FEXP: + case ISD::STRICT_FEXP2: + case ISD::STRICT_FLOG: + case ISD::STRICT_FLOG10: + case ISD::STRICT_FLOG2: + case ISD::STRICT_FRINT: + case ISD::STRICT_FNEARBYINT: + // These pseudo-ops get legalized as if they were their non-strict + // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT + // is also legal, but if ISD::FSQRT requires expansion then so does + // ISD::STRICT_FSQRT. 
+ Action = getStrictFPOpcodeAction(TLI, Node->getOpcode(), + Node->getValueType(0)); + break; default: if (Node->getOpcode() >= ISD::BUILTIN_OP_END) { @@ -2032,6 +2084,9 @@ SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node, RTLIB::Libcall Call_F80, RTLIB::Libcall Call_F128, RTLIB::Libcall Call_PPCF128) { + if (Node->isStrictFPOpcode()) + Node = DAG.mutateStrictFPToFP(Node); + RTLIB::Libcall LC; switch (Node->getSimpleValueType(0).SimpleTy) { default: llvm_unreachable("Unexpected request for libcall!"); @@ -3907,16 +3962,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::FMAX_PPCF128)); break; case ISD::FSQRT: + case ISD::STRICT_FSQRT: Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64, RTLIB::SQRT_F80, RTLIB::SQRT_F128, RTLIB::SQRT_PPCF128)); break; case ISD::FSIN: + case ISD::STRICT_FSIN: Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64, RTLIB::SIN_F80, RTLIB::SIN_F128, RTLIB::SIN_PPCF128)); break; case ISD::FCOS: + case ISD::STRICT_FCOS: Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64, RTLIB::COS_F80, RTLIB::COS_F128, RTLIB::COS_PPCF128)); @@ -3926,26 +3984,31 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandSinCosLibCall(Node, Results); break; case ISD::FLOG: + case ISD::STRICT_FLOG: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80, RTLIB::LOG_F128, RTLIB::LOG_PPCF128)); break; case ISD::FLOG2: + case ISD::STRICT_FLOG2: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80, RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128)); break; case ISD::FLOG10: + case ISD::STRICT_FLOG10: Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80, RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128)); break; case ISD::FEXP: + case ISD::STRICT_FEXP: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80, RTLIB::EXP_F128, RTLIB::EXP_PPCF128)); break; case ISD::FEXP2: + case ISD::STRICT_FEXP2: Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80, RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128)); @@ -3966,11 +4029,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::CEIL_PPCF128)); break; case ISD::FRINT: + case ISD::STRICT_FRINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64, RTLIB::RINT_F80, RTLIB::RINT_F128, RTLIB::RINT_PPCF128)); break; case ISD::FNEARBYINT: + case ISD::STRICT_FNEARBYINT: Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32, RTLIB::NEARBYINT_F64, RTLIB::NEARBYINT_F80, @@ -3985,11 +4050,13 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { RTLIB::ROUND_PPCF128)); break; case ISD::FPOWI: + case ISD::STRICT_FPOWI: Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64, RTLIB::POWI_F80, RTLIB::POWI_F128, RTLIB::POWI_PPCF128)); break; case ISD::FPOW: + case ISD::STRICT_FPOW: Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80, RTLIB::POW_F128, RTLIB::POW_PPCF128)); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 16c1f78f1b35..177898e1e950 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -4779,23 +4779,23 @@ static bool FindOptimalMemOpLowering(std::vector &MemOps, DAG.getMachineFunction()); if (VT == MVT::Other) { - if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) || 
- TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) { - VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS); - } else { - switch (DstAlign & 7) { - case 0: VT = MVT::i64; break; - case 4: VT = MVT::i32; break; - case 2: VT = MVT::i16; break; - default: VT = MVT::i8; break; - } - } + // Use the largest integer type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + VT = MVT::i64; + while (DstAlign && DstAlign < VT.getSizeInBits() / 8 && + !TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) + VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); + assert(VT.isInteger()); + // Find the largest legal integer type. MVT LVT = MVT::i64; while (!TLI.isTypeLegal(LVT)) LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1); assert(LVT.isInteger()); + // If the type we've chosen is larger than the largest legal integer type + // then use that instead. if (VT.bitsGT(LVT)) VT = LVT; } @@ -6542,6 +6542,63 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc, return N; } +SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) { + unsigned OrigOpc = Node->getOpcode(); + unsigned NewOpc; + bool IsUnary = false; + switch (OrigOpc) { + default: + llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!"); + case ISD::STRICT_FADD: NewOpc = ISD::FADD; break; + case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; break; + case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; break; + case ISD::STRICT_FDIV: NewOpc = ISD::FDIV; break; + case ISD::STRICT_FREM: NewOpc = ISD::FREM; break; + case ISD::STRICT_FSQRT: NewOpc = ISD::FSQRT; IsUnary = true; break; + case ISD::STRICT_FPOW: NewOpc = ISD::FPOW; break; + case ISD::STRICT_FPOWI: NewOpc = ISD::FPOWI; break; + case ISD::STRICT_FSIN: NewOpc = ISD::FSIN; IsUnary = true; break; + case ISD::STRICT_FCOS: NewOpc = ISD::FCOS; IsUnary = true; break; + case ISD::STRICT_FEXP: NewOpc = ISD::FEXP; IsUnary = true; break; + case ISD::STRICT_FEXP2: NewOpc = ISD::FEXP2; IsUnary = true; break; + case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; IsUnary = true; break; + case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; IsUnary = true; break; + case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; IsUnary = true; break; + case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; IsUnary = true; break; + case ISD::STRICT_FNEARBYINT: + NewOpc = ISD::FNEARBYINT; + IsUnary = true; + break; + } + + // We're taking this node out of the chain, so we need to re-link things. + SDValue InputChain = Node->getOperand(0); + SDValue OutputChain = SDValue(Node, 1); + ReplaceAllUsesOfValueWith(OutputChain, InputChain); + + SDVTList VTs = getVTList(Node->getOperand(1).getValueType()); + SDNode *Res = nullptr; + if (IsUnary) + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1) }); + else + Res = MorphNodeTo(Node, NewOpc, VTs, { Node->getOperand(1), + Node->getOperand(2) }); + + // MorphNodeTo can operate in two ways: if an existing node with the + // specified operands exists, it can just return it. Otherwise, it + // updates the node in place to have the requested operands. + if (Res == Node) { + // If we updated the node in place, reset the node ID. To the isel, + // this should be just like a newly allocated machine node. + Res->setNodeId(-1); + } else { + ReplaceAllUsesWith(Node, Res); + RemoveDeadNode(Node); + } + + return Res; +} + /// getMachineNode - These are used for target selectors to create a new node /// with specified return type(s), MachineInstr opcode, and operands. 
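The new SelectionDAG::mutateStrictFPToFP above is called from both ExpandFPLibCall and instruction selection: it takes the constrained pseudo-op out of the chain and rewrites it as the ordinary FP node. A minimal standalone sketch of that idea in plain C++ (illustrative names only, not the SelectionDAG API; the real code also redirects users of the node's chain result to the incoming chain and morphs the node in place):

#include <cassert>
#include <cstdio>
#include <vector>

// Illustrative opcode set; the real mapping covers every STRICT_* node above.
enum Opcode { FADD, FSQRT, STRICT_FADD, STRICT_FSQRT };

struct Node {
  Opcode Opc;
  std::vector<int> Ops; // Ops[0] stands in for the chain on STRICT_* nodes.
};

// Swap the pseudo-opcode for its normal counterpart and drop the chain operand.
static Node mutateStrictToPlain(const Node &N) {
  Opcode NewOpc = N.Opc;
  switch (N.Opc) {
  case STRICT_FADD:  NewOpc = FADD;  break;
  case STRICT_FSQRT: NewOpc = FSQRT; break;
  default: assert(false && "not a strict FP pseudo-op");
  }
  return Node{NewOpc, std::vector<int>(N.Ops.begin() + 1, N.Ops.end())};
}

int main() {
  Node Strict{STRICT_FSQRT, {/*chain*/ 0, /*value*/ 7}};
  Node Plain = mutateStrictToPlain(Strict);
  std::printf("opcode %d, %zu operand(s)\n", int(Plain.Opc), Plain.Ops.size());
  return 0;
}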
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 57d340c41c39..b895da21a7ff 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4736,24 +4736,15 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N, DIExpression *Expr, int64_t Offset, const DebugLoc &dl, unsigned DbgSDNodeOrder) { - SDDbgValue *SDV; - auto *FISDN = dyn_cast(N.getNode()); - if (FISDN && Expr->startsWithDeref()) { + if (auto *FISDN = dyn_cast(N.getNode())) { // Construct a FrameIndexDbgValue for FrameIndexSDNodes so we can describe // stack slot locations as such instead of as indirectly addressed // locations. - ArrayRef TrailingElements(Expr->elements_begin() + 1, - Expr->elements_end()); - DIExpression *DerefedDIExpr = - DIExpression::get(*DAG.getContext(), TrailingElements); - int FI = FISDN->getIndex(); - SDV = DAG.getFrameIndexDbgValue(Variable, DerefedDIExpr, FI, 0, dl, - DbgSDNodeOrder); - } else { - SDV = DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, - Offset, dl, DbgSDNodeOrder); + return DAG.getFrameIndexDbgValue(Variable, Expr, FISDN->getIndex(), 0, dl, + DbgSDNodeOrder); } - return SDV; + return DAG.getDbgValue(Variable, Expr, N.getNode(), N.getResNo(), false, + Offset, dl, DbgSDNodeOrder); } // VisualStudio defines setjmp as _setjmp @@ -5254,7 +5245,19 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: - visitConstrainedFPIntrinsic(I, Intrinsic); + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: + visitConstrainedFPIntrinsic(cast(I)); return nullptr; case Intrinsic::fmuladd: { EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); @@ -5752,11 +5755,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { } } -void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, - unsigned Intrinsic) { +void SelectionDAGBuilder::visitConstrainedFPIntrinsic( + const ConstrainedFPIntrinsic &FPI) { SDLoc sdl = getCurSDLoc(); unsigned Opcode; - switch (Intrinsic) { + switch (FPI.getIntrinsicID()) { default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. 
case Intrinsic::experimental_constrained_fadd: Opcode = ISD::STRICT_FADD; @@ -5773,23 +5776,64 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(const CallInst &I, case Intrinsic::experimental_constrained_frem: Opcode = ISD::STRICT_FREM; break; + case Intrinsic::experimental_constrained_sqrt: + Opcode = ISD::STRICT_FSQRT; + break; + case Intrinsic::experimental_constrained_pow: + Opcode = ISD::STRICT_FPOW; + break; + case Intrinsic::experimental_constrained_powi: + Opcode = ISD::STRICT_FPOWI; + break; + case Intrinsic::experimental_constrained_sin: + Opcode = ISD::STRICT_FSIN; + break; + case Intrinsic::experimental_constrained_cos: + Opcode = ISD::STRICT_FCOS; + break; + case Intrinsic::experimental_constrained_exp: + Opcode = ISD::STRICT_FEXP; + break; + case Intrinsic::experimental_constrained_exp2: + Opcode = ISD::STRICT_FEXP2; + break; + case Intrinsic::experimental_constrained_log: + Opcode = ISD::STRICT_FLOG; + break; + case Intrinsic::experimental_constrained_log10: + Opcode = ISD::STRICT_FLOG10; + break; + case Intrinsic::experimental_constrained_log2: + Opcode = ISD::STRICT_FLOG2; + break; + case Intrinsic::experimental_constrained_rint: + Opcode = ISD::STRICT_FRINT; + break; + case Intrinsic::experimental_constrained_nearbyint: + Opcode = ISD::STRICT_FNEARBYINT; + break; } const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue Chain = getRoot(); - SDValue Ops[3] = { Chain, getValue(I.getArgOperand(0)), - getValue(I.getArgOperand(1)) }; SmallVector ValueVTs; - ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs); + ComputeValueVTs(TLI, DAG.getDataLayout(), FPI.getType(), ValueVTs); ValueVTs.push_back(MVT::Other); // Out chain SDVTList VTs = DAG.getVTList(ValueVTs); - SDValue Result = DAG.getNode(Opcode, sdl, VTs, Ops); + SDValue Result; + if (FPI.isUnaryOp()) + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)) }); + else + Result = DAG.getNode(Opcode, sdl, VTs, + { Chain, getValue(FPI.getArgOperand(0)), + getValue(FPI.getArgOperand(1)) }); assert(Result.getNode()->getNumValues() == 2); SDValue OutChain = Result.getValue(1); DAG.setRoot(OutChain); SDValue FPResult = Result.getValue(0); - setValue(&I, FPResult); + setValue(&FPI, FPResult); } std::pair diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index bdaee858da61..77e131fa551c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -895,7 +895,7 @@ private: void visitInlineAsm(ImmutableCallSite CS); const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic); void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic); - void visitConstrainedFPIntrinsic(const CallInst &I, unsigned Intrinsic); + void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI); void visitVAStart(const CallInst &I); void visitVAArg(const VAArgInst &I); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 5e0feccb6b4c..687b882c5e4d 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -905,50 +905,6 @@ public: } // end anonymous namespace -static bool isStrictFPOp(SDNode *Node, unsigned &NewOpc) { - unsigned OrigOpc = Node->getOpcode(); - switch (OrigOpc) { - case ISD::STRICT_FADD: NewOpc = ISD::FADD; return true; - case ISD::STRICT_FSUB: NewOpc = ISD::FSUB; return true; - case ISD::STRICT_FMUL: NewOpc = ISD::FMUL; return true; - case 
ISD::STRICT_FDIV: NewOpc = ISD::FDIV; return true; - case ISD::STRICT_FREM: NewOpc = ISD::FREM; return true; - default: return false; - } -} - -SDNode* SelectionDAGISel::MutateStrictFPToFP(SDNode *Node, unsigned NewOpc) { - assert(((Node->getOpcode() == ISD::STRICT_FADD && NewOpc == ISD::FADD) || - (Node->getOpcode() == ISD::STRICT_FSUB && NewOpc == ISD::FSUB) || - (Node->getOpcode() == ISD::STRICT_FMUL && NewOpc == ISD::FMUL) || - (Node->getOpcode() == ISD::STRICT_FDIV && NewOpc == ISD::FDIV) || - (Node->getOpcode() == ISD::STRICT_FREM && NewOpc == ISD::FREM)) && - "Unexpected StrictFP opcode!"); - - // We're taking this node out of the chain, so we need to re-link things. - SDValue InputChain = Node->getOperand(0); - SDValue OutputChain = SDValue(Node, 1); - CurDAG->ReplaceAllUsesOfValueWith(OutputChain, InputChain); - - SDVTList VTs = CurDAG->getVTList(Node->getOperand(1).getValueType()); - SDValue Ops[2] = { Node->getOperand(1), Node->getOperand(2) }; - SDNode *Res = CurDAG->MorphNodeTo(Node, NewOpc, VTs, Ops); - - // MorphNodeTo can operate in two ways: if an existing node with the - // specified operands exists, it can just return it. Otherwise, it - // updates the node in place to have the requested operands. - if (Res == Node) { - // If we updated the node in place, reset the node ID. To the isel, - // this should be just like a newly allocated machine node. - Res->setNodeId(-1); - } else { - CurDAG->ReplaceAllUsesWith(Node, Res); - CurDAG->RemoveDeadNode(Node); - } - - return Res; -} - void SelectionDAGISel::DoInstructionSelection() { DEBUG(dbgs() << "===== Instruction selection begins: BB#" << FuncInfo->MBB->getNumber() @@ -992,15 +948,12 @@ void SelectionDAGISel::DoInstructionSelection() { // If the current node is a strict FP pseudo-op, the isStrictFPOp() // function will provide the corresponding normal FP opcode to which the // node should be mutated. - unsigned NormalFPOpc = ISD::UNDEF; - bool IsStrictFPOp = isStrictFPOp(Node, NormalFPOpc); - if (IsStrictFPOp) - Node = MutateStrictFPToFP(Node, NormalFPOpc); + // + // FIXME: The backends need a way to handle FP constraints. + if (Node->isStrictFPOpcode()) + Node = CurDAG->mutateStrictFPToFP(Node); Select(Node); - - // FIXME: Add code here to attach an implicit def and use of - // target-specific FP environment registers. } CurDAG->setRoot(Dummy.getValue()); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index befbd80d7965..0dffffee9976 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -603,11 +603,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op.getOperand(0), ~Known.Zero & NewMask, Known2, TLO, Depth+1)) return true; - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known one on one side, return the other. // These bits cannot contribute to the result of the 'and'. 
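The assertion rewrites in the surrounding hunks are a readability change: KnownBits::hasConflict() expresses exactly the invariant the old (Known.Zero & Known.One) == 0 checks spelled out by hand. A small standalone model of that invariant (illustrative, not the llvm::KnownBits class):

#include <cassert>
#include <cstdint>

struct KnownBitsModel {
  uint64_t Zero = 0; // bits proven to be 0
  uint64_t One = 0;  // bits proven to be 1
  // True iff some bit is claimed to be both 0 and 1, i.e. the analysis state
  // is self-contradictory. Equivalent to the old hand-written assert condition.
  bool hasConflict() const { return (Zero & One) != 0; }
};

int main() {
  KnownBitsModel Known;
  Known.Zero = 0xF0; // high nibble known zero
  Known.One = 0x0F;  // low nibble known one
  assert(!Known.hasConflict() && "Bits known to be one AND zero?");
  return 0;
}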
@@ -633,11 +633,11 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, case ISD::OR: if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op.getOperand(0), ~Known.One & NewMask, Known2, TLO, Depth+1)) return true; - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'or'. @@ -660,10 +660,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, case ISD::XOR: { if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); if (SimplifyDemandedBits(Op.getOperand(0), NewMask, Known2, TLO, Depth+1)) return true; - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If all of the demanded bits are known zero on one side, return the other. // These bits cannot contribute to the result of the 'xor'. @@ -725,8 +725,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return true; if (SimplifyDemandedBits(Op.getOperand(1), NewMask, Known2, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(Op, NewMask, TLO)) @@ -741,8 +741,8 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, return true; if (SimplifyDemandedBits(Op.getOperand(2), NewMask, Known2, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); - assert((Known2.Zero & Known2.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); + assert(!Known2.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(Op, NewMask, TLO)) @@ -907,7 +907,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // Compute the new bits that are at the top now. 
if (SimplifyDemandedBits(InOp, InDemandedMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); @@ -947,7 +947,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShAmt); Known.One.lshrInPlace(ShAmt); @@ -1029,7 +1029,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // If the sign bit of the input is known set or clear, then we know the // top bits of the result. @@ -1084,7 +1084,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known = Known.zext(BitWidth); Known.Zero |= NewBits; break; @@ -1134,7 +1134,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt InMask = NewMask.trunc(OperandBitWidth); if (SimplifyDemandedBits(Op.getOperand(0), InMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known = Known.zext(BitWidth); break; } @@ -1193,7 +1193,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, } } - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case ISD::AssertZext: { @@ -1205,7 +1205,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask, Known, TLO, Depth+1)) return true; - assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero |= ~InMask; break; diff --git a/lib/CodeGen/ShadowStackGCLowering.cpp b/lib/CodeGen/ShadowStackGCLowering.cpp index ff7d205c1f4c..6750fde57638 100644 --- a/lib/CodeGen/ShadowStackGCLowering.cpp +++ b/lib/CodeGen/ShadowStackGCLowering.cpp @@ -27,7 +27,7 @@ using namespace llvm; -#define DEBUG_TYPE "shadowstackgclowering" +#define DEBUG_TYPE "shadow-stack-gc-lowering" namespace { @@ -66,10 +66,10 @@ private: }; } -INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) INITIALIZE_PASS_DEPENDENCY(GCModuleInfo) -INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering", +INITIALIZE_PASS_END(ShadowStackGCLowering, DEBUG_TYPE, "Shadow Stack GC Lowering", false, false) FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); } diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 2638702da152..aa75f5e2caa2 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -210,13 +210,12 @@ public: char ShrinkWrap::ID = 0; char 
&llvm::ShrinkWrapID = ShrinkWrap::ID; -INITIALIZE_PASS_BEGIN(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, - false) +INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false) +INITIALIZE_PASS_END(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false) bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const { diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index e9eff4d0acb2..09e9c3bb3354 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -74,7 +74,7 @@ private: } // end anonymous namespace char SjLjEHPrepare::ID = 0; -INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions", +INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions", false, false) // Public Interface To the SjLjEHPrepare pass. diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp index bc2a1d09056b..3656832a7f1a 100644 --- a/lib/CodeGen/SlotIndexes.cpp +++ b/lib/CodeGen/SlotIndexes.cpp @@ -19,7 +19,7 @@ using namespace llvm; #define DEBUG_TYPE "slotindexes" char SlotIndexes::ID = 0; -INITIALIZE_PASS(SlotIndexes, "slotindexes", +INITIALIZE_PASS(SlotIndexes, DEBUG_TYPE, "Slot index numbering", false, false) STATISTIC(NumLocalRenum, "Number of local renumberings"); diff --git a/lib/CodeGen/SpillPlacement.cpp b/lib/CodeGen/SpillPlacement.cpp index 43cbf4add0f8..0abe1c47da55 100644 --- a/lib/CodeGen/SpillPlacement.cpp +++ b/lib/CodeGen/SpillPlacement.cpp @@ -40,14 +40,14 @@ using namespace llvm; -#define DEBUG_TYPE "spillplacement" +#define DEBUG_TYPE "spill-code-placement" char SpillPlacement::ID = 0; -INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_BEGIN(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) INITIALIZE_PASS_DEPENDENCY(EdgeBundles) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement", +INITIALIZE_PASS_END(SpillPlacement, DEBUG_TYPE, "Spill Code Placement Analysis", true, true) char &llvm::SpillPlacementID = SpillPlacement::ID; diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp index 86a16187fcb6..acb3676fdd71 100644 --- a/lib/CodeGen/StackColoring.cpp +++ b/lib/CodeGen/StackColoring.cpp @@ -53,7 +53,7 @@ using namespace llvm; -#define DEBUG_TYPE "stackcoloring" +#define DEBUG_TYPE "stack-coloring" static cl::opt DisableColoring("no-stack-coloring", @@ -371,12 +371,12 @@ private: char StackColoring::ID = 0; char &llvm::StackColoringID = StackColoring::ID; -INITIALIZE_PASS_BEGIN(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_BEGIN(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(StackProtector) -INITIALIZE_PASS_END(StackColoring, - "stack-coloring", "Merge disjoint stack slots", false, false) +INITIALIZE_PASS_END(StackColoring, DEBUG_TYPE, + "Merge disjoint stack slots", false, false) void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 5da77264261b..ca8bde2d114a 100644 --- a/lib/CodeGen/StackProtector.cpp +++ 
b/lib/CodeGen/StackProtector.cpp @@ -58,10 +58,10 @@ static cl::opt EnableSelectionDAGSP("enable-selectiondag-sp", cl::init(true), cl::Hidden); char StackProtector::ID = 0; -INITIALIZE_PASS_BEGIN(StackProtector, "stack-protector", +INITIALIZE_PASS_BEGIN(StackProtector, DEBUG_TYPE, "Insert stack protectors", false, true) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) -INITIALIZE_PASS_END(StackProtector, "stack-protector", +INITIALIZE_PASS_END(StackProtector, DEBUG_TYPE, "Insert stack protectors", false, true) FunctionPass *llvm::createStackProtectorPass() { return new StackProtector(); } diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 234b2043a6a1..d1758ecbd79f 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -32,7 +32,7 @@ #include using namespace llvm; -#define DEBUG_TYPE "stackslotcoloring" +#define DEBUG_TYPE "stack-slot-coloring" static cl::opt DisableSharing("no-stack-slot-sharing", @@ -116,12 +116,12 @@ namespace { char StackSlotColoring::ID = 0; char &llvm::StackSlotColoringID = StackSlotColoring::ID; -INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_BEGIN(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) INITIALIZE_PASS_DEPENDENCY(SlotIndexes) INITIALIZE_PASS_DEPENDENCY(LiveStacks) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) -INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring", +INITIALIZE_PASS_END(StackSlotColoring, DEBUG_TYPE, "Stack Slot Coloring", false, false) namespace { diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp index e2377d89497d..ad0b04373656 100644 --- a/lib/CodeGen/TailDuplication.cpp +++ b/lib/CodeGen/TailDuplication.cpp @@ -40,8 +40,7 @@ char TailDuplicatePass::ID = 0; char &llvm::TailDuplicateID = TailDuplicatePass::ID; -INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false, - false) +INITIALIZE_PASS(TailDuplicatePass, DEBUG_TYPE, "Tail Duplication", false, false) bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) { if (skipFunction(*MF.getFunction())) diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp index d2414200e9d5..d40f7af431a9 100644 --- a/lib/CodeGen/TailDuplicator.cpp +++ b/lib/CodeGen/TailDuplicator.cpp @@ -749,7 +749,7 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB, if (PredBB->succ_size() > 1) return false; - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector PredCond; if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond)) return false; @@ -832,7 +832,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB, appendCopies(PredBB, CopyInfos, Copies); // Simplify - MachineBasicBlock *PredTBB, *PredFBB; + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector PredCond; TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 7392c8327148..552a89f76ca2 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -52,7 +52,7 @@ using namespace llvm; -#define DEBUG_TYPE "twoaddrinstr" +#define DEBUG_TYPE "twoaddressinstruction" STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions"); STATISTIC(NumCommuted , "Number of instructions commuted to coalesce"); @@ -171,10 +171,10 @@ public: } // end anonymous namespace char 
TwoAddressInstructionPass::ID = 0; -INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction", +INITIALIZE_PASS_END(TwoAddressInstructionPass, DEBUG_TYPE, "Two-Address instruction pass", false, false) char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID; diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp index a632b40c20f5..4e7542bf31e0 100644 --- a/lib/CodeGen/WinEHPrepare.cpp +++ b/lib/CodeGen/WinEHPrepare.cpp @@ -94,7 +94,7 @@ private: } // end anonymous namespace char WinEHPrepare::ID = 0; -INITIALIZE_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions", +INITIALIZE_PASS(WinEHPrepare, DEBUG_TYPE, "Prepare Windows exceptions", false, false) FunctionPass *llvm::createWinEHPass() { return new WinEHPrepare(); } diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index 556ebf78622f..90193d07b95d 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -22,6 +22,7 @@ add_llvm_library(LLVMDebugInfoCodeView TypeDatabaseVisitor.cpp TypeDumpVisitor.cpp TypeIndex.cpp + TypeIndexDiscovery.cpp TypeRecordMapping.cpp TypeSerializer.cpp TypeStreamMerger.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index f95c3e79388e..705b548141b0 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -45,24 +45,9 @@ static Error visitKnownMember(CVMemberRecord &Record, } static Expected deserializeTypeServerRecord(CVType &Record) { - class StealTypeServerVisitor : public TypeVisitorCallbacks { - public: - explicit StealTypeServerVisitor(TypeServer2Record &TR) : TR(TR) {} - - Error visitKnownRecord(CVType &CVR, TypeServer2Record &Record) override { - TR = Record; - return Error::success(); - } - - private: - TypeServer2Record &TR; - }; - TypeServer2Record R(TypeRecordKind::TypeServer2); - StealTypeServerVisitor Thief(R); - if (auto EC = visitTypeRecord(Record, Thief)) + if (auto EC = TypeDeserializer::deserializeAs(Record, R)) return std::move(EC); - return R; } @@ -308,8 +293,9 @@ Error llvm::codeview::visitTypeRecord(CVType &Record, Error llvm::codeview::visitTypeStream(const CVTypeArray &Types, TypeVisitorCallbacks &Callbacks, + VisitorDataSource Source, TypeServerHandler *TS) { - VisitHelper V(Callbacks, VDS_BytesPresent); + VisitHelper V(Callbacks, Source); if (TS) V.Visitor.addTypeServerHandler(*TS); return V.Visitor.visitTypeStream(Types); diff --git a/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp new file mode 100644 index 000000000000..11e2e215303c --- /dev/null +++ b/lib/DebugInfo/CodeView/TypeIndexDiscovery.cpp @@ -0,0 +1,371 @@ +//===- TypeIndexDiscovery.cpp -----------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/Endian.h" + +using namespace llvm; +using namespace llvm::codeview; + +static inline MethodKind getMethodKind(uint16_t Attrs) { + Attrs &= uint16_t(MethodOptions::MethodKindMask); + Attrs >>= 2; + return MethodKind(Attrs); +} + +static inline bool isIntroVirtual(uint16_t Attrs) { + MethodKind MK = getMethodKind(Attrs); + return MK == MethodKind::IntroducingVirtual || + MK == MethodKind::PureIntroducingVirtual; +} + +static inline PointerMode getPointerMode(uint32_t Attrs) { + return static_cast((Attrs >> PointerRecord::PointerModeShift) & + PointerRecord::PointerModeMask); +} + +static inline bool isMemberPointer(uint32_t Attrs) { + PointerMode Mode = getPointerMode(Attrs); + return Mode == PointerMode::PointerToDataMember || + Mode == PointerMode::PointerToMemberFunction; +} + +static inline uint32_t getEncodedIntegerLength(ArrayRef Data) { + uint16_t N = support::endian::read16le(Data.data()); + if (N < LF_NUMERIC) + return 2; + + assert(N <= LF_UQUADWORD); + + constexpr uint32_t Sizes[] = { + 1, // LF_CHAR + 2, // LF_SHORT + 2, // LF_USHORT + 4, // LF_LONG + 4, // LF_ULONG + 4, // LF_REAL32 + 8, // LF_REAL64 + 10, // LF_REAL80 + 16, // LF_REAL128 + 8, // LF_QUADWORD + 8, // LF_UQUADWORD + }; + + return Sizes[N - LF_NUMERIC]; +} + +static inline uint32_t getCStringLength(ArrayRef Data) { + const char *S = reinterpret_cast(Data.data()); + return strlen(S) + 1; +} + +static void handleMethodOverloadList(ArrayRef Content, + SmallVectorImpl &Refs) { + uint32_t Offset = 0; + + while (!Content.empty()) { + // Array of: + // 0: Attrs + // 2: Padding + // 4: TypeIndex + // if (isIntroVirtual()) + // 8: VFTableOffset + + // At least 8 bytes are guaranteed. 4 extra bytes come iff function is an + // intro virtual.
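The encoded-integer helper above relies on CodeView's numeric-leaf convention: a little-endian 16-bit leaf below LF_NUMERIC (0x8000) is itself the value, while larger leaves name the type of the payload that follows. A standalone decoder for the two simplest cases (illustrative only; leaf kinds beyond LF_USHORT are omitted):

#include <cstdint>
#include <cstdio>

constexpr uint16_t LF_NUMERIC = 0x8000; // first extended numeric leaf
constexpr uint16_t LF_USHORT = 0x8002;  // 16-bit unsigned payload follows

static uint16_t read16le(const uint8_t *P) {
  return uint16_t(P[0]) | uint16_t(uint16_t(P[1]) << 8);
}

// Decode one encoded integer, reporting how many bytes it occupied.
static uint32_t decodeNumeric(const uint8_t *P, uint32_t &Len) {
  uint16_t Leaf = read16le(P);
  if (Leaf < LF_NUMERIC) {
    Len = 2; // the leaf itself is the value
    return Leaf;
  }
  if (Leaf == LF_USHORT) {
    Len = 4; // 2-byte leaf plus 2-byte payload
    return read16le(P + 2);
  }
  Len = 2; // other leaf kinds elided in this sketch
  return 0;
}

int main() {
  const uint8_t Direct[] = {0x34, 0x12};               // value 0x1234
  const uint8_t Extended[] = {0x02, 0x80, 0xCD, 0xAB}; // LF_USHORT, 0xABCD
  uint32_t Len = 0;
  uint32_t V = decodeNumeric(Direct, Len);
  std::printf("0x%x in %u bytes\n", V, Len);
  V = decodeNumeric(Extended, Len);
  std::printf("0x%x in %u bytes\n", V, Len);
  return 0;
}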
+ uint32_t Len = 8; + + uint16_t Attrs = support::endian::read16le(Content.data()); + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + + if (LLVM_UNLIKELY(isIntroVirtual(Attrs))) + Len += 4; + Offset += Len; + Content = Content.drop_front(Len); + } +} + +static uint32_t handleBaseClass(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Encoded Integer + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getEncodedIntegerLength(Data.drop_front(8)); +} + +static uint32_t handleEnumerator(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: Encoded Integer + // : Name + uint32_t Size = 4 + getEncodedIntegerLength(Data.drop_front(4)); + return Size + getCStringLength(Data.drop_front(Size)); +} + +static uint32_t handleDataMember(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Encoded Integer + // : Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + uint32_t Size = 8 + getEncodedIntegerLength(Data.drop_front(8)); + return Size + getCStringLength(Data.drop_front(Size)); +} + +static uint32_t handleOverloadedMethod(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getCStringLength(Data.drop_front(8)); +} + +static uint32_t handleOneMethod(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Attributes + // 4: Type + // if (isIntroVirtual) + // 8: VFTableOffset + // : Name + uint32_t Size = 8; + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + + uint16_t Attrs = support::endian::read16le(Data.drop_front(2).data()); + if (LLVM_UNLIKELY(isIntroVirtual(Attrs))) + Size += 4; + + return Size + getCStringLength(Data.drop_front(Size)); +} + +static uint32_t handleNestedType(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getCStringLength(Data.drop_front(8)); +} + +static uint32_t handleStaticDataMember(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + // 8: Name + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8 + getCStringLength(Data.drop_front(8)); +} + +static uint32_t handleVirtualBaseClass(ArrayRef Data, uint32_t Offset, + bool IsIndirect, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Attrs + // 4: TypeIndex + // 8: TypeIndex + // 12: Encoded Integer + // : Encoded Integer + uint32_t Size = 12; + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 2}); + Size += getEncodedIntegerLength(Data.drop_front(Size)); + Size += getEncodedIntegerLength(Data.drop_front(Size)); + return Size; +} + +static uint32_t handleVFPtr(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8; +} + +static uint32_t handleListContinuation(ArrayRef Data, uint32_t Offset, + SmallVectorImpl &Refs) { + // 0: Kind + // 2: Padding + // 4: TypeIndex + Refs.push_back({TiRefKind::TypeRef, Offset + 4, 1}); + return 8; +} + +static void handleFieldList(ArrayRef Content, + SmallVectorImpl &Refs) { + uint32_t Offset = 0; + uint32_t ThisLen = 0; + while (!Content.empty()) { + TypeLeafKind Kind = + 
static_cast(support::endian::read16le(Content.data())); + switch (Kind) { + case LF_BCLASS: + ThisLen = handleBaseClass(Content, Offset, Refs); + break; + case LF_ENUMERATE: + ThisLen = handleEnumerator(Content, Offset, Refs); + break; + case LF_MEMBER: + ThisLen = handleDataMember(Content, Offset, Refs); + break; + case LF_METHOD: + ThisLen = handleOverloadedMethod(Content, Offset, Refs); + break; + case LF_ONEMETHOD: + ThisLen = handleOneMethod(Content, Offset, Refs); + break; + case LF_NESTTYPE: + ThisLen = handleNestedType(Content, Offset, Refs); + break; + case LF_STMEMBER: + ThisLen = handleStaticDataMember(Content, Offset, Refs); + break; + case LF_VBCLASS: + case LF_IVBCLASS: + ThisLen = + handleVirtualBaseClass(Content, Offset, Kind == LF_VBCLASS, Refs); + break; + case LF_VFUNCTAB: + ThisLen = handleVFPtr(Content, Offset, Refs); + break; + case LF_INDEX: + ThisLen = handleListContinuation(Content, Offset, Refs); + break; + default: + return; + } + Content = Content.drop_front(ThisLen); + Offset += ThisLen; + if (!Content.empty()) { + uint8_t Pad = Content.front(); + if (Pad >= LF_PAD0) { + uint32_t Skip = Pad & 0x0F; + Content = Content.drop_front(Skip); + Offset += Skip; + } + } + } +} + +static void handlePointer(ArrayRef Content, + SmallVectorImpl &Refs) { + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + + uint32_t Attrs = support::endian::read32le(Content.drop_front(4).data()); + if (isMemberPointer(Attrs)) + Refs.push_back({TiRefKind::TypeRef, 8, 1}); +} + +static void discoverTypeIndices(ArrayRef Content, TypeLeafKind Kind, + SmallVectorImpl &Refs) { + uint32_t Count; + // FIXME: In the future it would be nice if we could avoid hardcoding these + // values. One idea is to define some structures representing these types + // that would allow the use of offsetof(). 
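One way to picture the FIXME above: if the fixed-layout prefix of a record were described as a struct, offsetof() could supply the offsets that are hard-coded in the switch below. A standalone sketch under that assumption (the struct is purely illustrative, not an existing LLVM type):

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical fixed-layout prefix of an LF_FUNC_ID record: an item index,
// then a type index, then a NUL-terminated name.
struct FuncIdPrefix {
  uint32_t ParentScope;  // item (IPI) index
  uint32_t FunctionType; // type (TPI) index
};

int main() {
  std::printf("index references at offsets %zu and %zu\n",
              offsetof(FuncIdPrefix, ParentScope),
              offsetof(FuncIdPrefix, FunctionType));
  return 0;
}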
+ switch (Kind) { + case TypeLeafKind::LF_FUNC_ID: + Refs.push_back({TiRefKind::IndexRef, 0, 1}); + Refs.push_back({TiRefKind::TypeRef, 4, 1}); + break; + case TypeLeafKind::LF_MFUNC_ID: + Refs.push_back({TiRefKind::TypeRef, 0, 2}); + break; + case TypeLeafKind::LF_STRING_ID: + Refs.push_back({TiRefKind::IndexRef, 0, 1}); + break; + case TypeLeafKind::LF_SUBSTR_LIST: + Count = support::endian::read32le(Content.data()); + if (Count > 0) + Refs.push_back({TiRefKind::IndexRef, 4, Count}); + break; + case TypeLeafKind::LF_BUILDINFO: + Count = support::endian::read16le(Content.data()); + if (Count > 0) + Refs.push_back({TiRefKind::IndexRef, 2, Count}); + break; + case TypeLeafKind::LF_UDT_SRC_LINE: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + Refs.push_back({TiRefKind::IndexRef, 4, 1}); + break; + case TypeLeafKind::LF_UDT_MOD_SRC_LINE: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + break; + case TypeLeafKind::LF_MODIFIER: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + break; + case TypeLeafKind::LF_PROCEDURE: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + Refs.push_back({TiRefKind::TypeRef, 8, 1}); + break; + case TypeLeafKind::LF_MFUNCTION: + Refs.push_back({TiRefKind::TypeRef, 0, 3}); + Refs.push_back({TiRefKind::TypeRef, 16, 1}); + break; + case TypeLeafKind::LF_ARGLIST: + Count = support::endian::read32le(Content.data()); + if (Count > 0) + Refs.push_back({TiRefKind::TypeRef, 4, Count}); + break; + case TypeLeafKind::LF_ARRAY: + Refs.push_back({TiRefKind::TypeRef, 0, 2}); + break; + case TypeLeafKind::LF_CLASS: + case TypeLeafKind::LF_STRUCTURE: + case TypeLeafKind::LF_INTERFACE: + Refs.push_back({TiRefKind::TypeRef, 4, 3}); + break; + case TypeLeafKind::LF_UNION: + Refs.push_back({TiRefKind::TypeRef, 4, 1}); + break; + case TypeLeafKind::LF_ENUM: + Refs.push_back({TiRefKind::TypeRef, 4, 2}); + break; + case TypeLeafKind::LF_BITFIELD: + Refs.push_back({TiRefKind::TypeRef, 0, 1}); + break; + case TypeLeafKind::LF_VFTABLE: + Refs.push_back({TiRefKind::TypeRef, 0, 2}); + break; + case TypeLeafKind::LF_VTSHAPE: + break; + case TypeLeafKind::LF_METHODLIST: + handleMethodOverloadList(Content, Refs); + break; + case TypeLeafKind::LF_FIELDLIST: + handleFieldList(Content, Refs); + break; + case TypeLeafKind::LF_POINTER: + handlePointer(Content, Refs); + break; + default: + break; + } +} + +void llvm::codeview::discoverTypeIndices(const CVType &Type, + SmallVectorImpl &Refs) { + ::discoverTypeIndices(Type.content(), Type.kind(), Refs); +} + +void llvm::codeview::discoverTypeIndices(ArrayRef RecordData, + SmallVectorImpl &Refs) { + const RecordPrefix *P = + reinterpret_cast(RecordData.data()); + TypeLeafKind K = static_cast(uint16_t(P->RecordKind)); + ::discoverTypeIndices(RecordData.drop_front(sizeof(RecordPrefix)), K, Refs); +} diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp index 3b061e67e05e..93c1198e36ce 100644 --- a/lib/DebugInfo/CodeView/TypeSerializer.cpp +++ b/lib/DebugInfo/CodeView/TypeSerializer.cpp @@ -9,6 +9,7 @@ #include "llvm/DebugInfo/CodeView/TypeSerializer.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/Support/BinaryStreamWriter.h" #include @@ -16,23 +17,111 @@ using namespace llvm; using namespace llvm::codeview; +namespace { +struct HashedType { + uint64_t Hash; + const uint8_t *Data; + unsigned Size; // FIXME: Go to uint16_t? + TypeIndex Index; +}; + +/// Wrapper around a poitner to a HashedType. Hash and equality operations are +/// based on data in the pointee. 
+struct HashedTypePtr { + HashedTypePtr() = default; + HashedTypePtr(HashedType *Ptr) : Ptr(Ptr) {} + HashedType *Ptr = nullptr; +}; +} // namespace + +namespace llvm { +template <> struct DenseMapInfo { + static inline HashedTypePtr getEmptyKey() { return HashedTypePtr(nullptr); } + static inline HashedTypePtr getTombstoneKey() { + return HashedTypePtr(reinterpret_cast(1)); + } + static unsigned getHashValue(HashedTypePtr Val) { + assert(Val.Ptr != getEmptyKey().Ptr && Val.Ptr != getTombstoneKey().Ptr); + return Val.Ptr->Hash; + } + static bool isEqual(HashedTypePtr LHSP, HashedTypePtr RHSP) { + HashedType *LHS = LHSP.Ptr; + HashedType *RHS = RHSP.Ptr; + if (RHS == getEmptyKey().Ptr || RHS == getTombstoneKey().Ptr) + return LHS == RHS; + if (LHS->Hash != RHS->Hash || LHS->Size != RHS->Size) + return false; + return ::memcmp(LHS->Data, RHS->Data, LHS->Size) == 0; + } +}; +} + +/// Private implementation so that we don't leak our DenseMap instantiations to +/// users. +class llvm::codeview::TypeHasher { +private: + /// Storage for type record provided by the caller. Records will outlive the + /// hasher object, so they should be allocated here. + BumpPtrAllocator &RecordStorage; + + /// Storage for hash keys. These only need to live as long as the hashing + /// operation. + BumpPtrAllocator KeyStorage; + + /// Hash table. We really want a DenseMap, TypeIndex> here, + /// but DenseMap is inefficient when the keys are long (like type records) + /// because it recomputes the hash value of every key when it grows. This + /// value type stores the hash out of line in KeyStorage, so that table + /// entries are small and easy to rehash. + DenseSet HashedRecords; + +public: + TypeHasher(BumpPtrAllocator &RecordStorage) : RecordStorage(RecordStorage) {} + + void reset() { HashedRecords.clear(); } + + /// Takes the bytes of type record, inserts them into the hash table, saves + /// them, and returns a pointer to an identical stable type record along with + /// its type index in the destination stream. + TypeIndex getOrCreateRecord(ArrayRef &Record, TypeIndex TI); +}; + +TypeIndex TypeHasher::getOrCreateRecord(ArrayRef &Record, + TypeIndex TI) { + assert(Record.size() < UINT32_MAX && "Record too big"); + assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!"); + + // Compute the hash up front so we can store it in the key. + HashedType TempHashedType = {hash_value(Record), Record.data(), + unsigned(Record.size()), TI}; + auto Result = HashedRecords.insert(HashedTypePtr(&TempHashedType)); + HashedType *&Hashed = Result.first->Ptr; + + if (Result.second) { + // This was a new type record. We need stable storage for both the key and + // the record. The record should outlive the hashing operation. + Hashed = KeyStorage.Allocate(); + *Hashed = TempHashedType; + + uint8_t *Stable = RecordStorage.Allocate(Record.size()); + memcpy(Stable, Record.data(), Record.size()); + Hashed->Data = Stable; + assert(Hashed->Size == Record.size()); + } + + // Update the caller's copy of Record to point a stable copy. 
+ Record = ArrayRef(Hashed->Data, Hashed->Size); + return Hashed->Index; +} + +TypeIndex TypeSerializer::nextTypeIndex() const { + return TypeIndex::fromArrayIndex(SeenRecords.size()); +} + bool TypeSerializer::isInFieldList() const { return TypeKind.hasValue() && *TypeKind == TypeLeafKind::LF_FIELDLIST; } -TypeIndex TypeSerializer::calcNextTypeIndex() const { - if (LastTypeIndex.isNoneType()) - return TypeIndex(TypeIndex::FirstNonSimpleIndex); - else - return TypeIndex(LastTypeIndex.getIndex() + 1); -} - -TypeIndex TypeSerializer::incrementTypeIndex() { - TypeIndex Previous = LastTypeIndex; - LastTypeIndex = calcNextTypeIndex(); - return Previous; -} - MutableArrayRef TypeSerializer::getCurrentSubRecordData() { assert(isInFieldList()); return getCurrentRecordData().drop_front(CurrentSegment.length()); @@ -51,46 +140,6 @@ Error TypeSerializer::writeRecordPrefix(TypeLeafKind Kind) { return Error::success(); } -TypeIndex -TypeSerializer::insertRecordBytesPrivate(MutableArrayRef Record) { - assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!"); - - StringRef S(reinterpret_cast(Record.data()), Record.size()); - - TypeIndex NextTypeIndex = calcNextTypeIndex(); - auto Result = HashedRecords.try_emplace(S, NextTypeIndex); - if (Result.second) { - LastTypeIndex = NextTypeIndex; - SeenRecords.push_back(Record); - } - return Result.first->getValue(); -} - -TypeIndex -TypeSerializer::insertRecordBytesWithCopy(CVType &Record, - MutableArrayRef Data) { - assert(Data.size() % 4 == 0 && "Record is not aligned to 4 bytes!"); - - StringRef S(reinterpret_cast(Data.data()), Data.size()); - - // Do a two state lookup / insert so that we don't have to allocate unless - // we're going - // to do an insert. This is a big memory savings. - auto Iter = HashedRecords.find(S); - if (Iter != HashedRecords.end()) - return Iter->second; - - LastTypeIndex = calcNextTypeIndex(); - uint8_t *Copy = RecordStorage.Allocate(Data.size()); - ::memcpy(Copy, Data.data(), Data.size()); - Data = MutableArrayRef(Copy, Data.size()); - S = StringRef(reinterpret_cast(Data.data()), Data.size()); - HashedRecords.insert(std::make_pair(S, LastTypeIndex)); - SeenRecords.push_back(Data); - Record.RecordData = Data; - return LastTypeIndex; -} - Expected> TypeSerializer::addPadding(MutableArrayRef Record) { uint32_t Align = Record.size() % 4; @@ -108,27 +157,79 @@ TypeSerializer::addPadding(MutableArrayRef Record) { return MutableArrayRef(Record.data(), Record.size() + N); } -TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage) - : RecordStorage(Storage), LastTypeIndex(), - RecordBuffer(MaxRecordLength * 2), +TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage, bool Hash) + : RecordStorage(Storage), RecordBuffer(MaxRecordLength * 2), Stream(RecordBuffer, llvm::support::little), Writer(Stream), Mapping(Writer) { // RecordBuffer needs to be able to hold enough data so that if we are 1 // byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes, // we won't overflow. 
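The hashing scheme above amounts to content-addressed deduplication with the hash stored out of line, so growing the table never re-reads the record bytes. A simplified standalone analogue using only the standard library (container choices and names are illustrative; the code above uses a pointer-keyed DenseSet plus a BumpPtrAllocator):

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <deque>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_set>

struct HashedRec {
  size_t Hash;         // precomputed; table growth never re-hashes the bytes
  const uint8_t *Data; // canonical, stable copy of the record
  size_t Size;
  unsigned Index;      // index assigned to the canonical copy
};

struct RecHash {
  size_t operator()(const HashedRec *R) const { return R->Hash; }
};
struct RecEq {
  bool operator()(const HashedRec *A, const HashedRec *B) const {
    return A->Size == B->Size && std::memcmp(A->Data, B->Data, A->Size) == 0;
  }
};

class RecordDeduper {
  std::deque<std::string> Storage; // stable byte storage (deque never moves)
  std::deque<HashedRec> Keys;
  std::unordered_set<const HashedRec *, RecHash, RecEq> Seen;

public:
  // Returns the index of the first record with identical bytes, inserting a
  // stable copy if this content has not been seen before.
  unsigned getOrCreate(const uint8_t *Data, size_t Size) {
    size_t H = std::hash<std::string_view>{}(
        std::string_view(reinterpret_cast<const char *>(Data), Size));
    HashedRec Probe{H, Data, Size, unsigned(Storage.size())};
    auto It = Seen.find(&Probe);
    if (It != Seen.end())
      return (*It)->Index; // duplicate: reuse the earlier index
    Storage.emplace_back(reinterpret_cast<const char *>(Data), Size);
    Keys.push_back({H, reinterpret_cast<const uint8_t *>(Storage.back().data()),
                    Size, Probe.Index});
    Seen.insert(&Keys.back());
    return Probe.Index;
  }
};

int main() {
  RecordDeduper D;
  const uint8_t A[] = {1, 2, 3, 4}, B[] = {5, 6, 7, 8};
  unsigned I0 = D.getOrCreate(A, 4); // 0: first time this content is seen
  unsigned I1 = D.getOrCreate(B, 4); // 1: different content, new index
  unsigned I2 = D.getOrCreate(A, 4); // 0: identical bytes, deduplicated
  std::printf("%u %u %u\n", I0, I1, I2);
  return 0;
}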
+ if (Hash) + Hasher = make_unique(Storage); } -ArrayRef> TypeSerializer::records() const { +TypeSerializer::~TypeSerializer() = default; + +ArrayRef> TypeSerializer::records() const { return SeenRecords; } -TypeIndex TypeSerializer::getLastTypeIndex() const { return LastTypeIndex; } +void TypeSerializer::reset() { + if (Hasher) + Hasher->reset(); + Writer.setOffset(0); + CurrentSegment = RecordSegment(); + FieldListSegments.clear(); + TypeKind.reset(); + MemberKind.reset(); + SeenRecords.clear(); +} -TypeIndex TypeSerializer::insertRecordBytes(MutableArrayRef Record) { +TypeIndex TypeSerializer::insertRecordBytes(ArrayRef &Record) { assert(!TypeKind.hasValue() && "Already in a type mapping!"); assert(Writer.getOffset() == 0 && "Stream has data already!"); - return insertRecordBytesPrivate(Record); + if (Hasher) { + TypeIndex ActualTI = Hasher->getOrCreateRecord(Record, nextTypeIndex()); + if (nextTypeIndex() == ActualTI) + SeenRecords.push_back(Record); + return ActualTI; + } + + TypeIndex NewTI = nextTypeIndex(); + uint8_t *Stable = RecordStorage.Allocate(Record.size()); + memcpy(Stable, Record.data(), Record.size()); + Record = ArrayRef(Stable, Record.size()); + SeenRecords.push_back(Record); + return NewTI; +} + +TypeIndex TypeSerializer::insertRecord(const RemappedType &Record) { + assert(!TypeKind.hasValue() && "Already in a type mapping!"); + assert(Writer.getOffset() == 0 && "Stream has data already!"); + + TypeIndex TI; + ArrayRef OriginalData = Record.OriginalRecord.RecordData; + if (Record.Mappings.empty()) { + // This record did not remap any type indices. Just write it. + return insertRecordBytes(OriginalData); + } + + // At least one type index was remapped. Before we can hash it we have to + // copy the full record bytes, re-write each type index, then hash the copy. + // We do this in temporary storage since only the DenseMap can decide whether + // this record already exists, and if it does we don't want the memory to + // stick around. + RemapStorage.resize(OriginalData.size()); + ::memcpy(&RemapStorage[0], OriginalData.data(), OriginalData.size()); + uint8_t *ContentBegin = RemapStorage.data() + sizeof(RecordPrefix); + for (const auto &M : Record.Mappings) { + // First 4 bytes of every record are the record prefix, but the mapping + // offset is relative to the content which starts after. + *(TypeIndex *)(ContentBegin + M.first) = M.second; + } + auto RemapRef = makeArrayRef(RemapStorage); + return insertRecordBytes(RemapRef); } Error TypeSerializer::visitTypeBegin(CVType &Record) { @@ -163,8 +264,13 @@ Expected TypeSerializer::visitTypeEndGetIndex(CVType &Record) { Prefix->RecordLen = ThisRecordData.size() - sizeof(uint16_t); Record.Type = *TypeKind; - TypeIndex InsertedTypeIndex = - insertRecordBytesWithCopy(Record, ThisRecordData); + Record.RecordData = ThisRecordData; + + // insertRecordBytes assumes we're not in a mapping, so do this first. + TypeKind.reset(); + Writer.setOffset(0); + + TypeIndex InsertedTypeIndex = insertRecordBytes(Record.RecordData); // Write out each additional segment in reverse order, and update each // record's continuation index to point to the previous one. 
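The reverse-order loop described by the comment above exists because an overlong field list is emitted as several segments, and each segment's continuation placeholder (asserted as 0xB0C0B0C0 just below) must hold the type index of the segment that logically follows it, so that segment has to be inserted first. A standalone sketch of the back-to-front linking (indices and the placeholder handling are simplified):

#include <cstdio>
#include <string>
#include <vector>

struct Segment {
  std::string Payload;
  int Continuation = -1; // index of the segment that logically follows, if any
};

// Insert segments last-to-first so each one can point at the record that was
// just written, mirroring how the serializer patches its placeholder with the
// previously inserted type index.
static int emitFieldList(std::vector<Segment> Segments,
                         std::vector<Segment> &Stream) {
  int PrevIndex = -1;
  for (auto It = Segments.rbegin(); It != Segments.rend(); ++It) {
    It->Continuation = PrevIndex;
    Stream.push_back(*It);
    PrevIndex = int(Stream.size()) - 1;
  }
  return PrevIndex; // index of the first segment, i.e. the record's real index
}

int main() {
  std::vector<Segment> Stream;
  int Head = emitFieldList({{"members 0-99"}, {"members 100-150"}}, Stream);
  std::printf("head record at %d, continues at %d\n", Head,
              Stream[Head].Continuation);
  return 0;
}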
@@ -174,11 +280,9 @@ Expected TypeSerializer::visitTypeEndGetIndex(CVType &Record) { reinterpret_cast(CIBytes.data()); assert(*CI == 0xB0C0B0C0 && "Invalid TypeIndex placeholder"); *CI = InsertedTypeIndex.getIndex(); - InsertedTypeIndex = insertRecordBytesPrivate(X); + InsertedTypeIndex = insertRecordBytes(X); } - TypeKind.reset(); - Writer.setOffset(0); FieldListSegments.clear(); CurrentSegment.SubRecords.clear(); diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp index 46747f8eab99..71a0966df036 100644 --- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp +++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp @@ -11,7 +11,9 @@ #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h" +#include "llvm/DebugInfo/CodeView/TypeDeserializer.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" +#include "llvm/DebugInfo/CodeView/TypeIndexDiscovery.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h" #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" @@ -57,37 +59,56 @@ namespace { /// looking at the record kind. class TypeStreamMerger : public TypeVisitorCallbacks { public: - TypeStreamMerger(TypeTableBuilder &DestIdStream, - TypeTableBuilder &DestTypeStream, - SmallVectorImpl &SourceToDest, - TypeServerHandler *Handler) - : DestIdStream(DestIdStream), DestTypeStream(DestTypeStream), - FieldListBuilder(DestTypeStream), Handler(Handler), - IndexMap(SourceToDest) {} + explicit TypeStreamMerger(SmallVectorImpl &SourceToDest, + TypeServerHandler *Handler) + : Handler(Handler), IndexMap(SourceToDest) { + SourceToDest.clear(); + } static const TypeIndex Untranslated; -/// TypeVisitorCallbacks overrides. -#define TYPE_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownRecord(CVType &CVR, Name##Record &Record) override; -#define MEMBER_RECORD(EnumName, EnumVal, Name) \ - Error visitKnownMember(CVMemberRecord &CVR, Name##Record &Record) override; -#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName) -#include "llvm/DebugInfo/CodeView/TypeRecords.def" - - Error visitUnknownType(CVType &Record) override; - Error visitTypeBegin(CVType &Record) override; Error visitTypeEnd(CVType &Record) override; - Error visitMemberEnd(CVMemberRecord &Record) override; - Error mergeStream(const CVTypeArray &Types); + Error mergeTypesAndIds(TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, + const CVTypeArray &IdsAndTypes); + Error mergeIdRecords(TypeTableBuilder &Dest, + ArrayRef TypeSourceToDest, + const CVTypeArray &Ids); + Error mergeTypeRecords(TypeTableBuilder &Dest, const CVTypeArray &Types); private: + Error doit(const CVTypeArray &Types); + void addMapping(TypeIndex Idx); - bool remapIndex(TypeIndex &Idx); + bool remapTypeIndex(TypeIndex &Idx); + bool remapItemIndex(TypeIndex &Idx); + + bool remapIndices(RemappedType &Record, ArrayRef Refs) { + auto OriginalData = Record.OriginalRecord.content(); + bool Success = true; + for (auto &Ref : Refs) { + uint32_t Offset = Ref.Offset; + ArrayRef Bytes = + OriginalData.slice(Ref.Offset, sizeof(TypeIndex)); + ArrayRef TIs(reinterpret_cast(Bytes.data()), + Ref.Count); + for (auto TI : TIs) { + TypeIndex NewTI = TI; + bool ThisSuccess = (Ref.Kind == TiRefKind::IndexRef) + ? 
remapItemIndex(NewTI) + : remapTypeIndex(NewTI); + if (ThisSuccess && NewTI != TI) + Record.Mappings.emplace_back(Offset, NewTI); + Offset += sizeof(TypeIndex); + Success &= ThisSuccess; + } + } + return Success; + } + + bool remapIndex(TypeIndex &Idx, ArrayRef Map); size_t slotForIndex(TypeIndex Idx) const { assert(!Idx.isSimple() && "simple type indices have no slots"); @@ -98,50 +119,46 @@ private: return llvm::make_error(cv_error_code::corrupt_record); } - template - Error writeRecord(RecordType &R, bool RemapSuccess) { + Error writeRecord(TypeTableBuilder &Dest, const RemappedType &Record, + bool RemapSuccess) { TypeIndex DestIdx = Untranslated; if (RemapSuccess) - DestIdx = DestTypeStream.writeKnownType(R); + DestIdx = Dest.writeSerializedRecord(Record); addMapping(DestIdx); return Error::success(); } - template - Error writeIdRecord(RecordType &R, bool RemapSuccess) { - TypeIndex DestIdx = Untranslated; - if (RemapSuccess) - DestIdx = DestIdStream.writeKnownType(R); + Error writeTypeRecord(const CVType &Record) { + TypeIndex DestIdx = + DestTypeStream->writeSerializedRecord(Record.RecordData); addMapping(DestIdx); return Error::success(); } - template - Error writeMember(RecordType &R, bool RemapSuccess) { - if (RemapSuccess) - FieldListBuilder.writeMemberType(R); - else - HadUntranslatedMember = true; - return Error::success(); + Error writeTypeRecord(const RemappedType &Record, bool RemapSuccess) { + return writeRecord(*DestTypeStream, Record, RemapSuccess); + } + + Error writeIdRecord(const RemappedType &Record, bool RemapSuccess) { + return writeRecord(*DestIdStream, Record, RemapSuccess); } Optional LastError; bool IsSecondPass = false; - bool HadUntranslatedMember = false; - unsigned NumBadIndices = 0; - BumpPtrAllocator Allocator; - - TypeTableBuilder &DestIdStream; - TypeTableBuilder &DestTypeStream; - FieldListRecordBuilder FieldListBuilder; - TypeServerHandler *Handler; - TypeIndex CurIndex{TypeIndex::FirstNonSimpleIndex}; + TypeTableBuilder *DestIdStream = nullptr; + TypeTableBuilder *DestTypeStream = nullptr; + TypeServerHandler *Handler = nullptr; + + // If we're only mapping id records, this array contains the mapping for + // type records. + ArrayRef TypeLookup; + /// Map from source type index to destination type index. Indexed by source /// type index minus 0x1000. 
SmallVectorImpl &IndexMap; @@ -151,22 +168,34 @@ private: const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated); -Error TypeStreamMerger::visitTypeBegin(CVRecord &Rec) { +Error TypeStreamMerger::visitTypeBegin(CVType &Rec) { + RemappedType R(Rec); + SmallVector Refs; + discoverTypeIndices(Rec.RecordData, Refs); + bool Success = remapIndices(R, Refs); + switch (Rec.kind()) { + case TypeLeafKind::LF_FUNC_ID: + case TypeLeafKind::LF_MFUNC_ID: + case TypeLeafKind::LF_STRING_ID: + case TypeLeafKind::LF_SUBSTR_LIST: + case TypeLeafKind::LF_BUILDINFO: + case TypeLeafKind::LF_UDT_SRC_LINE: + case TypeLeafKind::LF_UDT_MOD_SRC_LINE: + return writeIdRecord(R, Success); + default: + return writeTypeRecord(R, Success); + } return Error::success(); } -Error TypeStreamMerger::visitTypeEnd(CVRecord &Rec) { - CurIndex = TypeIndex(CurIndex.getIndex() + 1); +Error TypeStreamMerger::visitTypeEnd(CVType &Rec) { + ++CurIndex; if (!IsSecondPass) assert(IndexMap.size() == slotForIndex(CurIndex) && "visitKnownRecord should add one index map entry"); return Error::success(); } -Error TypeStreamMerger::visitMemberEnd(CVMemberRecord &Rec) { - return Error::success(); -} - void TypeStreamMerger::addMapping(TypeIndex Idx) { if (!IsSecondPass) { assert(IndexMap.size() == slotForIndex(CurIndex) && @@ -178,7 +207,7 @@ void TypeStreamMerger::addMapping(TypeIndex Idx) { } } -bool TypeStreamMerger::remapIndex(TypeIndex &Idx) { +bool TypeStreamMerger::remapIndex(TypeIndex &Idx, ArrayRef Map) { // Simple types are unchanged. if (Idx.isSimple()) return true; @@ -187,14 +216,14 @@ bool TypeStreamMerger::remapIndex(TypeIndex &Idx) { // successfully. If it refers to a type later in the stream or a record we // had to defer, defer it until later pass. unsigned MapPos = slotForIndex(Idx); - if (MapPos < IndexMap.size() && IndexMap[MapPos] != Untranslated) { - Idx = IndexMap[MapPos]; + if (MapPos < Map.size() && Map[MapPos] != Untranslated) { + Idx = Map[MapPos]; return true; } // If this is the second pass and this index isn't in the map, then it points // outside the current type stream, and this is a corrupt record. - if (IsSecondPass && MapPos >= IndexMap.size()) { + if (IsSecondPass && MapPos >= Map.size()) { // FIXME: Print a more useful error. We can give the current record and the // index that we think its pointing to. LastError = joinErrors(std::move(*LastError), errorCorruptRecord()); @@ -208,241 +237,61 @@ bool TypeStreamMerger::remapIndex(TypeIndex &Idx) { return false; } -//----------------------------------------------------------------------------// -// Item records -//----------------------------------------------------------------------------// +bool TypeStreamMerger::remapTypeIndex(TypeIndex &Idx) { + // If we're mapping a pure index stream, then IndexMap only contains mappings + // from OldIdStream -> NewIdStream, in which case we will need to use the + // special mapping from OldTypeStream -> NewTypeStream which was computed + // externally. Regardless, we use this special map if and only if we are + // doing an id-only mapping. 
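A compact standalone model of the remapping rule these helpers share (constants are illustrative; the real code uses TypeIndex and records an error rather than returning a plain bool): indices below the first non-simple index pass through unchanged, everything else is looked up in a source-to-destination table indexed by (index - 0x1000), and a miss marks a forward reference to be resolved on the second pass.

#include <cstdint>
#include <cstdio>
#include <vector>

constexpr uint32_t FirstNonSimpleIndex = 0x1000;
constexpr uint32_t Untranslated = 0xFFFFFFFF; // stand-in for "not yet mapped"

static bool remapIndex(uint32_t &Idx, const std::vector<uint32_t> &Map) {
  if (Idx < FirstNonSimpleIndex)
    return true; // simple (built-in) types keep their index
  uint32_t Slot = Idx - FirstNonSimpleIndex;
  if (Slot < Map.size() && Map[Slot] != Untranslated) {
    Idx = Map[Slot];
    return true;
  }
  return false; // forward reference or bad index: retry on the second pass
}

int main() {
  // Source records 0x1000 and 0x1001 already landed at 0x1000 and 0x1003;
  // source record 0x1002 has not been translated yet.
  std::vector<uint32_t> Map = {0x1000, 0x1003, Untranslated};
  uint32_t Simple = 0x74, Known = 0x1001, Forward = 0x1002;
  bool OkSimple = remapIndex(Simple, Map);   // true, index unchanged
  bool OkKnown = remapIndex(Known, Map);     // true, rewritten to 0x1003
  bool OkForward = remapIndex(Forward, Map); // false, deferred to pass two
  std::printf("%d %d %d -> 0x%x 0x%x 0x%x\n", OkSimple, OkKnown, OkForward,
              Simple, Known, Forward);
  return 0;
}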
+ if (DestTypeStream == nullptr) + return remapIndex(Idx, TypeLookup); -Error TypeStreamMerger::visitKnownRecord(CVType &, FuncIdRecord &R) { - bool Success = true; - Success &= remapIndex(R.ParentScope); - Success &= remapIndex(R.FunctionType); - return writeIdRecord(R, Success); + assert(TypeLookup.empty()); + return remapIndex(Idx, IndexMap); } -Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFuncIdRecord &R) { - bool Success = true; - Success &= remapIndex(R.ClassType); - Success &= remapIndex(R.FunctionType); - return writeIdRecord(R, Success); +bool TypeStreamMerger::remapItemIndex(TypeIndex &Idx) { + assert(DestIdStream); + return remapIndex(Idx, IndexMap); } -Error TypeStreamMerger::visitKnownRecord(CVType &, StringIdRecord &R) { - return writeIdRecord(R, remapIndex(R.Id)); +Error TypeStreamMerger::mergeTypeRecords(TypeTableBuilder &Dest, + const CVTypeArray &Types) { + DestTypeStream = &Dest; + + return doit(Types); } -Error TypeStreamMerger::visitKnownRecord(CVType &, StringListRecord &R) { - bool Success = true; - for (TypeIndex &Str : R.StringIndices) - Success &= remapIndex(Str); - return writeIdRecord(R, Success); +Error TypeStreamMerger::mergeIdRecords(TypeTableBuilder &Dest, + ArrayRef TypeSourceToDest, + const CVTypeArray &Ids) { + DestIdStream = &Dest; + TypeLookup = TypeSourceToDest; + + return doit(Ids); } -Error TypeStreamMerger::visitKnownRecord(CVType &, BuildInfoRecord &R) { - bool Success = true; - for (TypeIndex &Arg : R.ArgIndices) - Success &= remapIndex(Arg); - return writeIdRecord(R, Success); +Error TypeStreamMerger::mergeTypesAndIds(TypeTableBuilder &DestIds, + TypeTableBuilder &DestTypes, + const CVTypeArray &IdsAndTypes) { + DestIdStream = &DestIds; + DestTypeStream = &DestTypes; + + return doit(IdsAndTypes); } -Error TypeStreamMerger::visitKnownRecord(CVType &, UdtSourceLineRecord &R) { - bool Success = true; - Success &= remapIndex(R.UDT); - Success &= remapIndex(R.SourceFile); - // FIXME: Translate UdtSourceLineRecord into UdtModSourceLineRecords in the - // IPI stream. 
- return writeIdRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, UdtModSourceLineRecord &R) { - bool Success = true; - Success &= remapIndex(R.UDT); - Success &= remapIndex(R.SourceFile); - return writeIdRecord(R, Success); -} - -//----------------------------------------------------------------------------// -// Type records -//----------------------------------------------------------------------------// - -Error TypeStreamMerger::visitKnownRecord(CVType &, ModifierRecord &R) { - return writeRecord(R, remapIndex(R.ModifiedType)); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, ProcedureRecord &R) { - bool Success = true; - Success &= remapIndex(R.ReturnType); - Success &= remapIndex(R.ArgumentList); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFunctionRecord &R) { - bool Success = true; - Success &= remapIndex(R.ReturnType); - Success &= remapIndex(R.ClassType); - Success &= remapIndex(R.ThisType); - Success &= remapIndex(R.ArgumentList); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &Type, ArgListRecord &R) { - bool Success = true; - for (TypeIndex &Arg : R.ArgIndices) - Success &= remapIndex(Arg); - if (auto EC = writeRecord(R, Success)) - return EC; - return Error::success(); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, PointerRecord &R) { - bool Success = true; - Success &= remapIndex(R.ReferentType); - if (R.isPointerToMember()) - Success &= remapIndex(R.MemberInfo->ContainingType); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, ArrayRecord &R) { - bool Success = true; - Success &= remapIndex(R.ElementType); - Success &= remapIndex(R.IndexType); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, ClassRecord &R) { - bool Success = true; - Success &= remapIndex(R.FieldList); - Success &= remapIndex(R.DerivationList); - Success &= remapIndex(R.VTableShape); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, UnionRecord &R) { - return writeRecord(R, remapIndex(R.FieldList)); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, EnumRecord &R) { - bool Success = true; - Success &= remapIndex(R.FieldList); - Success &= remapIndex(R.UnderlyingType); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, BitFieldRecord &R) { - return writeRecord(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableShapeRecord &R) { - return writeRecord(R, true); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, TypeServer2Record &R) { - return writeRecord(R, true); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, LabelRecord &R) { - return writeRecord(R, true); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableRecord &R) { - bool Success = true; - Success &= remapIndex(R.CompleteClass); - Success &= remapIndex(R.OverriddenVFTable); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, - MethodOverloadListRecord &R) { - bool Success = true; - for (OneMethodRecord &Meth : R.Methods) - Success &= remapIndex(Meth.Type); - return writeRecord(R, Success); -} - -Error TypeStreamMerger::visitKnownRecord(CVType &, FieldListRecord &R) { - // Visit the members inside the field list. 
- HadUntranslatedMember = false; - FieldListBuilder.begin(); - if (auto EC = codeview::visitMemberRecordStream(R.Data, *this)) - return EC; - - // Write the record if we translated all field list members. - TypeIndex DestIdx = Untranslated; - if (!HadUntranslatedMember) - DestIdx = FieldListBuilder.end(); - else - FieldListBuilder.reset(); - addMapping(DestIdx); - - return Error::success(); -} - -//----------------------------------------------------------------------------// -// Member records -//----------------------------------------------------------------------------// - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - NestedTypeRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, OneMethodRecord &R) { - bool Success = true; - Success &= remapIndex(R.Type); - return writeMember(R, Success); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - OverloadedMethodRecord &R) { - return writeMember(R, remapIndex(R.MethodList)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - DataMemberRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - StaticDataMemberRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - EnumeratorRecord &R) { - return writeMember(R, true); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, VFPtrRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, BaseClassRecord &R) { - return writeMember(R, remapIndex(R.Type)); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - VirtualBaseClassRecord &R) { - bool Success = true; - Success &= remapIndex(R.BaseType); - Success &= remapIndex(R.VBPtrType); - return writeMember(R, Success); -} - -Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, - ListContinuationRecord &R) { - return writeMember(R, remapIndex(R.ContinuationIndex)); -} - -Error TypeStreamMerger::visitUnknownType(CVType &Rec) { - // We failed to translate a type. Translate this index as "not translated". - addMapping(TypeIndex(SimpleTypeKind::NotTranslated)); - return errorCorruptRecord(); -} - -Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { - assert(IndexMap.empty()); +Error TypeStreamMerger::doit(const CVTypeArray &Types) { LastError = Error::success(); - if (auto EC = codeview::visitTypeStream(Types, *this, Handler)) + // We don't want to deserialize records. I guess this flag is poorly named, + // but it really means "Don't deserialize records before switching on the + // concrete type. + // FIXME: We can probably get even more speed here if we don't use the visitor + // pipeline here, but instead write the switch ourselves. I don't think it + // would buy us much since it's already pretty fast, but it's probably worth + // a few cycles. 
+ if (auto EC = + codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) return EC; // If we found bad indices but no other errors, try doing another pass and see @@ -458,7 +307,8 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { NumBadIndices = 0; CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex); - if (auto EC = codeview::visitTypeStream(Types, *this, Handler)) + if (auto EC = + codeview::visitTypeStream(Types, *this, VDS_BytesExternal, Handler)) return EC; assert(NumBadIndices <= BadIndicesRemaining && @@ -469,18 +319,32 @@ Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) { } } - IndexMap.clear(); - Error Ret = std::move(*LastError); LastError.reset(); return Ret; } -Error llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestIdStream, - TypeTableBuilder &DestTypeStream, +Error llvm::codeview::mergeTypeRecords(TypeTableBuilder &Dest, SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, - const CVTypeArray &Types) { - return TypeStreamMerger(DestIdStream, DestTypeStream, SourceToDest, Handler) - .mergeStream(Types); + const CVTypeArray &Types) { + TypeStreamMerger M(SourceToDest, Handler); + return M.mergeTypeRecords(Dest, Types); +} + +Error llvm::codeview::mergeIdRecords(TypeTableBuilder &Dest, + ArrayRef TypeSourceToDest, + SmallVectorImpl &SourceToDest, + const CVTypeArray &Ids) { + TypeStreamMerger M(SourceToDest, nullptr); + return M.mergeIdRecords(Dest, TypeSourceToDest, Ids); +} + +Error llvm::codeview::mergeTypeAndIdRecords( + TypeTableBuilder &DestIds, TypeTableBuilder &DestTypes, + SmallVectorImpl &SourceToDest, TypeServerHandler *Handler, + const CVTypeArray &IdsAndTypes) { + + TypeStreamMerger M(SourceToDest, Handler); + return M.mergeTypesAndIds(DestIds, DestTypes, IdsAndTypes); } diff --git a/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/lib/DebugInfo/CodeView/TypeTableCollection.cpp index a18710d6ab52..699694fde928 100644 --- a/lib/DebugInfo/CodeView/TypeTableCollection.cpp +++ b/lib/DebugInfo/CodeView/TypeTableCollection.cpp @@ -24,8 +24,7 @@ static void error(Error &&EC) { consumeError(std::move(EC)); } -TypeTableCollection::TypeTableCollection( - ArrayRef> Records) +TypeTableCollection::TypeTableCollection(ArrayRef> Records) : Records(Records), Database(Records.size()) {} Optional TypeTableCollection::getFirst() { diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 8e7c6c43d1a2..5ed55ce4c0dc 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -60,12 +60,15 @@ typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind; typedef DILineInfoSpecifier::FunctionNameKind FunctionNameKind; uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, - uint32_t *Off, const RelocAddrMap *Relocs) { + uint32_t *Off, const RelocAddrMap *Relocs, + uint64_t *SectionIndex) { if (!Relocs) return Data.getUnsigned(Off, Size); RelocAddrMap::const_iterator AI = Relocs->find(*Off); if (AI == Relocs->end()) return Data.getUnsigned(Off, Size); + if (SectionIndex) + *SectionIndex = AI->second.SectionIndex; return Data.getUnsigned(Off, Size) + AI->second.Value; } @@ -287,6 +290,15 @@ void DWARFContext::dump(raw_ostream &OS, DIDumpType DumpType, bool DumpEH, getStringSection(), isLittleEndian()); } +DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) { + // FIXME: Improve this for the case where this DWO file is really a DWP file + // with an index - use the index for lookup instead of a linear search. 
+ for (const auto &DWOCU : dwo_compile_units()) + if (DWOCU->getDWOId() == Hash) + return DWOCU.get(); + return nullptr; +} + DWARFDie DWARFContext::getDIEForOffset(uint32_t Offset) { parseCompileUnits(); if (auto *CU = CUs.getUnitForOffset(Offset)) @@ -897,28 +909,81 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address, return InliningInfo; } +std::shared_ptr +DWARFContext::getDWOContext(StringRef AbsolutePath) { + if (auto S = DWP.lock()) { + DWARFContext *Ctxt = S->Context.get(); + return std::shared_ptr(std::move(S), Ctxt); + } + + std::weak_ptr *Entry = &DWOFiles[AbsolutePath]; + + if (auto S = Entry->lock()) { + DWARFContext *Ctxt = S->Context.get(); + return std::shared_ptr(std::move(S), Ctxt); + } + + SmallString<128> DWPName; + Expected> Obj = [&] { + if (!CheckedForDWP) { + (getFileName() + ".dwp").toVector(DWPName); + auto Obj = object::ObjectFile::createObjectFile(DWPName); + if (Obj) { + Entry = &DWP; + return Obj; + } else { + CheckedForDWP = true; + // TODO: Should this error be handled (maybe in a high verbosity mode) + // before falling back to .dwo files? + consumeError(Obj.takeError()); + } + } + + return object::ObjectFile::createObjectFile(AbsolutePath); + }(); + + if (!Obj) { + // TODO: Actually report errors helpfully. + consumeError(Obj.takeError()); + return nullptr; + } + + auto S = std::make_shared(); + S->File = std::move(Obj.get()); + S->Context = llvm::make_unique(*S->File.getBinary()); + *Entry = S; + auto *Ctxt = S->Context.get(); + return std::shared_ptr(std::move(S), Ctxt); +} + static Error createError(const Twine &Reason, llvm::Error E) { return make_error(Reason + toString(std::move(E)), inconvertibleErrorCode()); } -/// Returns the address of symbol relocation used against. Used for futher -/// relocations computation. Symbol's section load address is taken in account if -/// LoadedObjectInfo interface is provided. -static Expected -getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, - const LoadedObjectInfo *L, - std::map &Cache) { - uint64_t Ret = 0; +/// SymInfo contains information about symbol: it's address +/// and section index which is -1LL for absolute symbols. +struct SymInfo { + uint64_t Address; + uint64_t SectionIndex; +}; + +/// Returns the address of symbol relocation used against and a section index. +/// Used for futher relocations computation. 
Symbol's section load address is +static Expected getSymbolInfo(const object::ObjectFile &Obj, + const RelocationRef &Reloc, + const LoadedObjectInfo *L, + std::map &Cache) { + SymInfo Ret = {0, (uint64_t)-1LL}; object::section_iterator RSec = Obj.section_end(); object::symbol_iterator Sym = Reloc.getSymbol(); - std::map::iterator CacheIt = Cache.end(); + std::map::iterator CacheIt = Cache.end(); // First calculate the address of the symbol or section as it appears // in the object file if (Sym != Obj.symbol_end()) { bool New; - std::tie(CacheIt, New) = Cache.insert({*Sym, 0}); + std::tie(CacheIt, New) = Cache.insert({*Sym, {0, 0}}); if (!New) return CacheIt->second; @@ -934,12 +999,15 @@ getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, SectOrErr.takeError()); RSec = *SectOrErr; - Ret = *SymAddrOrErr; + Ret.Address = *SymAddrOrErr; } else if (auto *MObj = dyn_cast(&Obj)) { RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl()); - Ret = RSec->getAddress(); + Ret.Address = RSec->getAddress(); } + if (RSec != Obj.section_end()) + Ret.SectionIndex = RSec->getIndex(); + // If we are given load addresses for the sections, we need to adjust: // SymAddr = (Address of Symbol Or Section in File) - // (Address of Section in File) + @@ -949,7 +1017,7 @@ getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, // we need to perform the same computation. if (L && RSec != Obj.section_end()) if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec)) - Ret += SectionLoadAddress - RSec->getAddress(); + Ret.Address += SectionLoadAddress - RSec->getAddress(); if (CacheIt != Cache.end()) CacheIt->second = Ret; @@ -989,8 +1057,8 @@ Error DWARFContextInMemory::maybeDecompress(const SectionRef &Sec, } DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, - const LoadedObjectInfo *L) - : IsLittleEndian(Obj.isLittleEndian()), + const LoadedObjectInfo *L) + : FileName(Obj.getFileName()), IsLittleEndian(Obj.isLittleEndian()), AddressSize(Obj.getBytesInAddress()) { for (const SectionRef &Section : Obj.sections()) { StringRef name; @@ -1008,7 +1076,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // Try to obtain an already relocated version of this section. // Else use the unrelocated section from the object file. We'll have to // apply relocations ourselves later. - if (!L || !L->getLoadedSectionContents(*RelocatedSection,data)) + if (!L || !L->getLoadedSectionContents(*RelocatedSection, data)) Section.getContents(data); if (auto Err = maybeDecompress(Section, name, data)) { @@ -1047,7 +1115,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, // If the section we're relocating was relocated already by the JIT, // then we used the relocated version above, so we do not need to process // relocations for it now. - if (L && L->getLoadedSectionContents(*RelocatedSection,RelSecData)) + if (L && L->getLoadedSectionContents(*RelocatedSection, RelSecData)) continue; // In Mach-o files, the relocations do not need to be applied if @@ -1091,29 +1159,30 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (Section.relocation_begin() == Section.relocation_end()) continue; - std::map AddrCache; + // Symbol to [address, section index] cache mapping. + std::map AddrCache; for (const RelocationRef &Reloc : Section.relocations()) { // FIXME: it's not clear how to correctly handle scattered // relocations. 
if (isRelocScattered(Obj, Reloc)) continue; - Expected SymAddrOrErr = - getSymbolAddress(Obj, Reloc, L, AddrCache); - if (!SymAddrOrErr) { - errs() << toString(SymAddrOrErr.takeError()) << '\n'; + Expected SymInfoOrErr = getSymbolInfo(Obj, Reloc, L, AddrCache); + if (!SymInfoOrErr) { + errs() << toString(SymInfoOrErr.takeError()) << '\n'; continue; } object::RelocVisitor V(Obj); - uint64_t Val = V.visit(Reloc.getType(), Reloc, *SymAddrOrErr); + uint64_t Val = V.visit(Reloc.getType(), Reloc, SymInfoOrErr->Address); if (V.error()) { SmallString<32> Name; Reloc.getTypeName(Name); errs() << "error: failed to compute relocation: " << Name << "\n"; continue; } - Map->insert({Reloc.getOffset(), {Val}}); + llvm::RelocAddrEntry Rel = {SymInfoOrErr->SectionIndex, Val}; + Map->insert({Reloc.getOffset(), Rel}); } } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index 8da797750abd..6b5e1d3c931b 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -35,8 +35,8 @@ bool DWARFDebugRangeList::extract(DataExtractor data, uint32_t *offset_ptr, while (true) { RangeListEntry entry; uint32_t prev_offset = *offset_ptr; - entry.StartAddress = - getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); + entry.StartAddress = getRelocatedValue(data, AddressSize, offset_ptr, + &Relocs, &entry.SectionIndex); entry.EndAddress = getRelocatedValue(data, AddressSize, offset_ptr, &Relocs); @@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const { if (RLE.isBaseAddressSelectionEntry(AddressSize)) { BaseAddress = RLE.EndAddress; } else { - Res.push_back( - {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress}); + Res.push_back({BaseAddress + RLE.StartAddress, + BaseAddress + RLE.EndAddress, RLE.SectionIndex}); } } return Res; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index e3bd759ba94b..fd45c77d3745 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -211,13 +211,16 @@ Optional DWARFDie::getHighPC(uint64_t LowPC) const { return None; } -bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC) const { - auto LowPcAddr = toAddress(find(DW_AT_low_pc)); +bool DWARFDie::getLowAndHighPC(uint64_t &LowPC, uint64_t &HighPC, + uint64_t &SectionIndex) const { + auto F = find(DW_AT_low_pc); + auto LowPcAddr = toAddress(F); if (!LowPcAddr) return false; if (auto HighPcAddr = getHighPC(*LowPcAddr)) { LowPC = *LowPcAddr; HighPC = *HighPcAddr; + SectionIndex = F->getSectionIndex(); return true; } return false; @@ -228,9 +231,9 @@ DWARFDie::getAddressRanges() const { if (isNULL()) return DWARFAddressRangesVector(); // Single range specified by low/high PC. - uint64_t LowPC, HighPC; - if (getLowAndHighPC(LowPC, HighPC)) - return {{LowPC, HighPC}}; + uint64_t LowPC, HighPC, Index; + if (getLowAndHighPC(LowPC, HighPC, Index)) + return {{LowPC, HighPC, Index}}; // Multiple ranges from .debug_ranges section. auto RangesOffset = toSectionOffset(find(DW_AT_ranges)); diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 1cbd3ea2c869..0963d7bfd713 100644 --- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -333,8 +333,8 @@ bool DWARFFormValue::extractValue(const DataExtractor &Data, return false; uint16_t AddrSize = (Form == DW_FORM_addr) ? 
U->getAddressByteSize() : U->getRefAddrByteSize(); - Value.uval = - getRelocatedValue(Data, AddrSize, OffsetPtr, U->getRelocMap()); + Value.uval = getRelocatedValue(Data, AddrSize, OffsetPtr, + U->getRelocMap(), &Value.SectionIndex); break; } case DW_FORM_exprloc: diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index c268afc222c3..c5add6a478b3 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -249,23 +249,6 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) { return DieArray.size(); } -DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath, uint64_t DWOId) { - auto Obj = object::ObjectFile::createObjectFile(DWOPath); - if (!Obj) { - // TODO: Actually report errors helpfully. - consumeError(Obj.takeError()); - return; - } - DWOFile = std::move(Obj.get()); - DWOContext.reset( - cast(new DWARFContextInMemory(*DWOFile.getBinary()))); - for (const auto &DWOCU : DWOContext->dwo_compile_units()) - if (DWOCU->getDWOId() == DWOId) { - DWOU = DWOCU.get(); - return; - } -} - bool DWARFUnit::parseDWO() { if (isDWO) return false; @@ -287,16 +270,18 @@ bool DWARFUnit::parseDWO() { auto DWOId = getDWOId(); if (!DWOId) return false; - DWO = llvm::make_unique(AbsolutePath, *DWOId); - DWARFUnit *DWOCU = DWO->getUnit(); - if (!DWOCU) { - DWO.reset(); + auto DWOContext = Context.getDWOContext(AbsolutePath); + if (!DWOContext) return false; - } + + DWARFCompileUnit *DWOCU = DWOContext->getDWOCompileUnitForHash(*DWOId); + if (!DWOCU) + return false; + DWO = std::shared_ptr(std::move(DWOContext), DWOCU); // Share .debug_addr and .debug_ranges section with compile unit in .dwo - DWOCU->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase); + DWO->setAddrOffsetSection(AddrOffsetSection, AddrOffsetSectionBase); auto DWORangesBase = UnitDie.getRangesBaseAttribute(); - DWOCU->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0); + DWO->setRangesSection(RangeSection, DWORangesBase ? *DWORangesBase : 0); return true; } @@ -339,8 +324,8 @@ void DWARFUnit::collectAddressRanges(DWARFAddressRangesVector &CURanges) { // Collect address ranges from DIEs in .dwo if necessary. bool DWOCreated = parseDWO(); - if (DWO.get()) - DWO->getUnit()->collectAddressRanges(CURanges); + if (DWO) + DWO->collectAddressRanges(CURanges); if (DWOCreated) DWO.reset(); @@ -400,7 +385,7 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address, // First, find the subroutine that contains the given address (the leaf // of inlined chain). DWARFDie SubroutineDIE = - (DWO ? DWO->getUnit() : this)->getSubroutineForAddress(Address); + (DWO ? 
DWO.get() : this)->getSubroutineForAddress(Address); while (SubroutineDIE) { if (SubroutineDIE.isSubroutineDIE()) diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp index 57953cfa338e..dfdeb8414212 100644 --- a/lib/DebugInfo/MSF/MappedBlockStream.cpp +++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp @@ -45,18 +45,17 @@ static Interval intersect(const Interval &I1, const Interval &I2) { std::min(I1.second, I2.second)); } -MappedBlockStream::MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks, +MappedBlockStream::MappedBlockStream(uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData) - : BlockSize(BlockSize), NumBlocks(NumBlocks), StreamLayout(Layout), - MsfData(MsfData) {} + : BlockSize(BlockSize), StreamLayout(Layout), MsfData(MsfData) {} std::unique_ptr -MappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks, +MappedBlockStream::createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, BinaryStreamRef MsfData) { return llvm::make_unique>( - BlockSize, NumBlocks, Layout, MsfData); + BlockSize, Layout, MsfData); } std::unique_ptr MappedBlockStream::createIndexedStream( @@ -66,7 +65,7 @@ std::unique_ptr MappedBlockStream::createIndexedStream( SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; return llvm::make_unique>( - Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -75,7 +74,7 @@ MappedBlockStream::createDirectoryStream(const MSFLayout &Layout, MSFStreamLayout SL; SL.Blocks = Layout.DirectoryBlocks; SL.Length = Layout.SB->NumDirectoryBytes; - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -83,7 +82,7 @@ MappedBlockStream::createFpmStream(const MSFLayout &Layout, BinaryStreamRef MsfData) { MSFStreamLayout SL; initializeFpmStreamLayout(Layout, SL); - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, @@ -173,7 +172,7 @@ Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset, uint32_t First = Offset / BlockSize; uint32_t Last = First; - while (Last < NumBlocks - 1) { + while (Last < getNumBlocks() - 1) { if (StreamLayout.Blocks[Last] != StreamLayout.Blocks[Last + 1] - 1) break; ++Last; @@ -313,17 +312,16 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset, } WritableMappedBlockStream::WritableMappedBlockStream( - uint32_t BlockSize, uint32_t NumBlocks, const MSFStreamLayout &Layout, + uint32_t BlockSize, const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData) - : ReadInterface(BlockSize, NumBlocks, Layout, MsfData), - WriteInterface(MsfData) {} + : ReadInterface(BlockSize, Layout, MsfData), WriteInterface(MsfData) {} std::unique_ptr -WritableMappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks, +WritableMappedBlockStream::createStream(uint32_t BlockSize, const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData) { return llvm::make_unique>( - BlockSize, NumBlocks, Layout, MsfData); + BlockSize, Layout, MsfData); } std::unique_ptr @@ -334,7 +332,7 @@ WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout, MSFStreamLayout SL; SL.Blocks = Layout.StreamMap[StreamIndex]; SL.Length = Layout.StreamSizes[StreamIndex]; - return createStream(Layout.SB->BlockSize, 
Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -343,7 +341,7 @@ WritableMappedBlockStream::createDirectoryStream( MSFStreamLayout SL; SL.Blocks = Layout.DirectoryBlocks; SL.Length = Layout.SB->NumDirectoryBytes; - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } std::unique_ptr @@ -351,7 +349,7 @@ WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData) { MSFStreamLayout SL; initializeFpmStreamLayout(Layout, SL); - return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData); + return createStream(Layout.SB->BlockSize, SL, MsfData); } Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size, diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp index c19a2f0d3110..23c7456d7772 100644 --- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp @@ -129,16 +129,21 @@ uint32_t DbiStreamBuilder::calculateSectionMapStreamSize() const { return sizeof(SecMapHeader) + sizeof(SecMapEntry) * SectionMap.size(); } -uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const { - uint32_t Size = 0; - Size += sizeof(ulittle16_t); // NumModules - Size += sizeof(ulittle16_t); // NumSourceFiles - Size += ModiList.size() * sizeof(ulittle16_t); // ModIndices - Size += ModiList.size() * sizeof(ulittle16_t); // ModFileCounts +uint32_t DbiStreamBuilder::calculateNamesOffset() const { + uint32_t Offset = 0; + Offset += sizeof(ulittle16_t); // NumModules + Offset += sizeof(ulittle16_t); // NumSourceFiles + Offset += ModiList.size() * sizeof(ulittle16_t); // ModIndices + Offset += ModiList.size() * sizeof(ulittle16_t); // ModFileCounts uint32_t NumFileInfos = 0; for (const auto &M : ModiList) NumFileInfos += M->source_files().size(); - Size += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets + Offset += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets + return Offset; +} + +uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const { + uint32_t Size = calculateNamesOffset(); Size += calculateNamesBufferSize(); return alignTo(Size, sizeof(uint32_t)); } @@ -157,9 +162,8 @@ uint32_t DbiStreamBuilder::calculateDbgStreamsSize() const { Error DbiStreamBuilder::generateFileInfoSubstream() { uint32_t Size = calculateFileInfoSubstreamSize(); - uint32_t NameSize = calculateNamesBufferSize(); auto Data = Allocator.Allocate(Size); - uint32_t NamesOffset = Size - NameSize; + uint32_t NamesOffset = calculateNamesOffset(); FileInfoBuffer = MutableBinaryByteStream(MutableArrayRef(Data, Size), llvm::support::little); @@ -207,6 +211,9 @@ Error DbiStreamBuilder::generateFileInfoSubstream() { } } + if (auto EC = NameBufferWriter.padToAlignment(sizeof(uint32_t))) + return EC; + if (NameBufferWriter.bytesRemaining() > 0) return make_error(raw_error_code::invalid_format, "The names buffer contained unexpected data."); diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp index f00567db743e..9fd90102f72c 100644 --- a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp +++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp @@ -47,7 +47,7 @@ void PDBTypeServerHandler::addSearchPath(StringRef Path) { if (Path.empty() || !sys::fs::is_directory(Path)) return; - SearchPaths.push_back(Path); + SearchPaths.insert(Path); } Expected @@ -57,7 
+57,13 @@ PDBTypeServerHandler::handleInternal(PDBFile &File, if (!ExpectedTpi) return ExpectedTpi.takeError(); - if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks)) + // For handling a type server, we should be using whatever the callback array + // was + // that is being used for the original file. We shouldn't allow the visitor + // to + // arbitrarily stick a deserializer in there. + if (auto EC = codeview::visitTypeStream(ExpectedTpi->typeArray(), Callbacks, + VDS_BytesExternal)) return std::move(EC); return true; @@ -80,13 +86,14 @@ Expected PDBTypeServerHandler::handle(TypeServer2Record &TS, cv_error_code::corrupt_record, "TypeServer2Record does not contain filename!"); - for (auto Path : SearchPaths) { - sys::path::append(Path, File); - if (!sys::fs::exists(Path)) + for (auto &Path : SearchPaths) { + SmallString<64> PathStr = Path.getKey(); + sys::path::append(PathStr, File); + if (!sys::fs::exists(PathStr)) continue; std::unique_ptr ThisSession; - if (auto EC = loadDataForPDB(PDB_ReaderType::Native, Path, ThisSession)) { + if (auto EC = loadDataForPDB(PDB_ReaderType::Native, PathStr, ThisSession)) { // It is not an error if this PDB fails to load, it just means that it // doesn't match and we should continue searching. ignoreErrors(std::move(EC)); diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp index 8e0065873892..623afb371b50 100644 --- a/lib/DebugInfo/PDB/Native/TpiStream.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp @@ -8,7 +8,9 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/PDB/Native/TpiStream.h" + #include "llvm/ADT/iterator_range.h" +#include "llvm/DebugInfo/CodeView/LazyRandomTypeCollection.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" #include "llvm/DebugInfo/MSF/MappedBlockStream.h" #include "llvm/DebugInfo/PDB/Native/PDBFile.h" @@ -104,6 +106,8 @@ Error TpiStream::reload() { HashStream = std::move(HS); } + Types = llvm::make_unique( + TypeRecords, getNumTypeRecords(), getTypeIndexOffsets()); return Error::success(); } diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp index f454ae61d965..34f4017d9828 100644 --- a/lib/Demangle/ItaniumDemangle.cpp +++ b/lib/Demangle/ItaniumDemangle.cpp @@ -2525,6 +2525,9 @@ static std::string base_name(std::string &s) { ++p0; break; } + if (!isalpha(*p0) && !isdigit(*p0) && *p0 != '_') { + return std::string(); + } } return std::string(p0, pe); } @@ -2612,39 +2615,45 @@ static const char *parse_unnamed_type_name(const char *first, const char *last, first = t0 + 1; } break; case 'l': { + size_t lambda_pos = db.names.size(); db.names.push_back(std::string("'lambda'(")); const char *t0 = first + 2; if (first[2] == 'v') { db.names.back().first += ')'; ++t0; } else { - const char *t1 = parse_type(t0, last, db); - if (t1 == t0) { + bool is_first_it = true; + while (true) { + long k0 = static_cast(db.names.size()); + const char *t1 = parse_type(t0, last, db); + long k1 = static_cast(db.names.size()); + if (t1 == t0) + break; + if (k0 >= k1) + return first; + // If the call to parse_type above found a pack expansion + // substitution, then multiple names could have been + // inserted into the name table. Walk through the names, + // appending each onto the lambda's parameter list. 
+ std::for_each(db.names.begin() + k0, db.names.begin() + k1, + [&](typename C::sub_type::value_type &pair) { + if (pair.empty()) + return; + auto &lambda = db.names[lambda_pos].first; + if (!is_first_it) + lambda.append(", "); + is_first_it = false; + lambda.append(pair.move_full()); + }); + db.names.erase(db.names.begin() + k0, db.names.end()); + t0 = t1; + } + if (is_first_it) { if (!db.names.empty()) db.names.pop_back(); return first; } - if (db.names.size() < 2) - return first; - auto tmp = db.names.back().move_full(); - db.names.pop_back(); - db.names.back().first.append(tmp); - t0 = t1; - while (true) { - t1 = parse_type(t0, last, db); - if (t1 == t0) - break; - if (db.names.size() < 2) - return first; - tmp = db.names.back().move_full(); - db.names.pop_back(); - if (!tmp.empty()) { - db.names.back().first.append(", "); - db.names.back().first.append(tmp); - } - t0 = t1; - } - if (db.names.empty()) + if (db.names.empty() || db.names.size() - 1 != lambda_pos) return first; db.names.back().first.append(")"); } @@ -4030,6 +4039,8 @@ static const char *parse_encoding(const char *first, const char *last, C &db) { save_value sb(db.tag_templates); if (db.encoding_depth > 1) db.tag_templates = true; + save_value sp(db.parsed_ctor_dtor_cv); + db.parsed_ctor_dtor_cv = false; switch (*first) { case 'G': case 'T': @@ -4229,6 +4240,7 @@ template struct string_pair { template string_pair(const char (&s)[N]) : first(s, N - 1) {} size_t size() const { return first.size() + second.size(); } + bool empty() const { return first.empty() && second.empty(); } StrT full() const { return first + second; } StrT move_full() { return std::move(first) + std::move(second); } }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 660843765b3f..9ce3974529bb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -737,23 +737,23 @@ void RuntimeDyldELF::resolvePPC64Relocation(const SectionEntry &Section, writeInt16BE(LocalAddress, applyPPCha(Delta)); } break; case ELF::R_PPC64_ADDR32: { - int32_t Result = static_cast(Value + Addend); - if (SignExtend32<32>(Result) != Result) + int64_t Result = static_cast(Value + Addend); + if (SignExtend64<32>(Result) != Result) llvm_unreachable("Relocation R_PPC64_ADDR32 overflow"); writeInt32BE(LocalAddress, Result); } break; case ELF::R_PPC64_REL24: { uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); - int32_t delta = static_cast(Value - FinalAddress + Addend); - if (SignExtend32<26>(delta) != delta) + int64_t delta = static_cast(Value - FinalAddress + Addend); + if (SignExtend64<26>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL24 overflow"); // Generates a 'bl
' instruction writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC)); } break; case ELF::R_PPC64_REL32: { uint64_t FinalAddress = Section.getLoadAddressWithOffset(Offset); - int32_t delta = static_cast(Value - FinalAddress + Addend); - if (SignExtend32<32>(delta) != delta) + int64_t delta = static_cast(Value - FinalAddress + Addend); + if (SignExtend64<32>(delta) != delta) llvm_unreachable("Relocation R_PPC64_REL32 overflow"); writeInt32BE(LocalAddress, delta); } break; @@ -1324,12 +1324,13 @@ RuntimeDyldELF::processRelocationRef( Obj.getPlatformFlags(AbiVariant); AbiVariant &= ELF::EF_PPC64_ABI; // A PPC branch relocation will need a stub function if the target is - // an external symbol (Symbol::ST_Unknown) or if the target address - // is not within the signed 24-bits branch address. + // an external symbol (either Value.SymbolName is set, or SymType is + // Symbol::ST_Unknown) or if the target address is not within the + // signed 24-bits branch address. SectionEntry &Section = Sections[SectionID]; uint8_t *Target = Section.getAddressWithOffset(Offset); bool RangeOverflow = false; - if (SymType != SymbolRef::ST_Unknown) { + if (!Value.SymbolName && SymType != SymbolRef::ST_Unknown) { if (AbiVariant != 2) { // In the ELFv1 ABI, a function call may point to the .opd entry, // so the final symbol value is calculated based on the relocation @@ -1344,21 +1345,19 @@ RuntimeDyldELF::processRelocationRef( } uint8_t *RelocTarget = Sections[Value.SectionID].getAddressWithOffset(Value.Addend); - int32_t delta = static_cast(Target - RelocTarget); + int64_t delta = static_cast(Target - RelocTarget); // If it is within 26-bits branch range, just set the branch target - if (SignExtend32<26>(delta) == delta) { + if (SignExtend64<26>(delta) == delta) { RelocationEntry RE(SectionID, Offset, RelType, Value.Addend); - if (Value.SymbolName) - addRelocationForSymbol(RE, Value.SymbolName); - else - addRelocationForSection(RE, Value.SectionID); + addRelocationForSection(RE, Value.SectionID); } else { RangeOverflow = true; } } - if (SymType == SymbolRef::ST_Unknown || RangeOverflow) { - // It is an external symbol (SymbolRef::ST_Unknown) or within a range - // larger than 24-bits. + if (Value.SymbolName || SymType == SymbolRef::ST_Unknown || + RangeOverflow) { + // It is an external symbol (either Value.SymbolName is set, or + // SymType is SymbolRef::ST_Unknown) or out of range. 
StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { // Symbol function stub already created, just relocate to it @@ -1412,7 +1411,7 @@ RuntimeDyldELF::processRelocationRef( RelType, 0); Section.advanceStubOffset(getMaxStubSize()); } - if (SymType == SymbolRef::ST_Unknown) { + if (Value.SymbolName || SymType == SymbolRef::ST_Unknown) { // Restore the TOC for external calls if (AbiVariant == 2) writeInt32BE(Target + 4, 0xE8410018); // ld r2,28(r1) diff --git a/lib/Fuzzer/FuzzerUtilPosix.cpp b/lib/Fuzzer/FuzzerUtilPosix.cpp index 0161309fbf86..bc85264ac187 100644 --- a/lib/Fuzzer/FuzzerUtilPosix.cpp +++ b/lib/Fuzzer/FuzzerUtilPosix.cpp @@ -47,8 +47,21 @@ static void FileSizeExceedHandler(int, siginfo_t *, void *) { static void SetSigaction(int signum, void (*callback)(int, siginfo_t *, void *)) { - struct sigaction sigact; - memset(&sigact, 0, sizeof(sigact)); + struct sigaction sigact = {}; + if (sigaction(signum, nullptr, &sigact)) { + Printf("libFuzzer: sigaction failed with %d\n", errno); + exit(1); + } + if (sigact.sa_flags & SA_SIGINFO) { + if (sigact.sa_sigaction) + return; + } else { + if (sigact.sa_handler != SIG_DFL && sigact.sa_handler != SIG_IGN && + sigact.sa_handler != SIG_ERR) + return; + } + + sigact = {}; sigact.sa_sigaction = callback; if (sigaction(signum, &sigact, 0)) { Printf("libFuzzer: sigaction failed with %d\n", errno); diff --git a/lib/Fuzzer/test/fuzzer-segv.test b/lib/Fuzzer/test/fuzzer-segv.test index b9a6a5ce44ca..90f01932f652 100644 --- a/lib/Fuzzer/test/fuzzer-segv.test +++ b/lib/Fuzzer/test/fuzzer-segv.test @@ -3,3 +3,5 @@ LIBFUZZER_OWN_SEGV_HANDLER: == ERROR: libFuzzer: deadly signal LIBFUZZER_OWN_SEGV_HANDLER: SUMMARY: libFuzzer: deadly signal LIBFUZZER_OWN_SEGV_HANDLER: Test unit written to ./crash- +RUN: env ASAN_OPTIONS=handle_segv=1 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_ASAN_SEGV_HANDLER +LIBFUZZER_ASAN_SEGV_HANDLER: ERROR: AddressSanitizer: {{SEGV|access-violation}} on unknown address diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index acfac316e91e..4ed7b021883d 100644 --- a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -212,27 +212,21 @@ using IndexAttrPair = std::pair; /// return type, and parameters. class AttributeListImpl final : public FoldingSetNode, - private TrailingObjects { + private TrailingObjects { friend class AttributeList; friend TrailingObjects; private: - LLVMContext &Context; - unsigned NumSlots; ///< Number of entries in this set. /// Bitset with a bit for each available attribute Attribute::AttrKind. uint64_t AvailableFunctionAttrs; + LLVMContext &Context; + unsigned NumAttrSets; ///< Number of entries in this set. // Helper fn for TrailingObjects class. - size_t numTrailingObjects(OverloadToken) { return NumSlots; } - - /// \brief Return a pointer to the IndexAttrPair for the specified slot. - const IndexAttrPair *getSlotPair(unsigned Slot) const { - return getTrailingObjects() + Slot; - } + size_t numTrailingObjects(OverloadToken) { return NumAttrSets; } public: - AttributeListImpl(LLVMContext &C, - ArrayRef> Slots); + AttributeListImpl(LLVMContext &C, ArrayRef Sets); // AttributesSetImpt is uniqued, these should not be available. AttributeListImpl(const AttributeListImpl &) = delete; @@ -243,41 +237,18 @@ public: /// \brief Get the context that created this AttributeListImpl. LLVMContext &getContext() { return Context; } - /// \brief Return the number of slots used in this attribute list. 
This is - /// the number of arguments that have an attribute set on them (including the - /// function itself). - unsigned getNumSlots() const { return NumSlots; } - - /// \brief Get the index of the given "slot" in the AttrNodes list. This index - /// is the index of the return, parameter, or function object that the - /// attributes are applied to, not the index into the AttrNodes list where the - /// attributes reside. - unsigned getSlotIndex(unsigned Slot) const { - return getSlotPair(Slot)->first; - } - - /// \brief Retrieve the attribute set node for the given "slot" in the - /// AttrNode list. - AttributeSet getSlotAttributes(unsigned Slot) const { - return getSlotPair(Slot)->second; - } - /// \brief Return true if the AttributeSet or the FunctionIndex has an /// enum attribute of the given kind. bool hasFnAttribute(Attribute::AttrKind Kind) const { return AvailableFunctionAttrs & ((uint64_t)1) << Kind; } - using iterator = AttributeSet::iterator; - - iterator begin(unsigned Slot) const { - return getSlotAttributes(Slot).begin(); - } - iterator end(unsigned Slot) const { return getSlotAttributes(Slot).end(); } + typedef const AttributeSet *iterator; + iterator begin() const { return getTrailingObjects(); } + iterator end() const { return begin() + NumAttrSets; } void Profile(FoldingSetNodeID &ID) const; - static void Profile(FoldingSetNodeID &ID, - ArrayRef> Nodes); + static void Profile(FoldingSetNodeID &ID, ArrayRef Nodes); void dump() const; }; diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index adb31d127a2e..19b7c3027232 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -507,7 +507,7 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef Attrs) { } AttributeSet AttributeSet::addAttribute(LLVMContext &C, - Attribute::AttrKind Kind) const { + Attribute::AttrKind Kind) const { if (hasAttribute(Kind)) return *this; AttrBuilder B; B.addAttribute(Kind); @@ -515,7 +515,7 @@ AttributeSet AttributeSet::addAttribute(LLVMContext &C, } AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind, - StringRef Value) const { + StringRef Value) const { AttrBuilder B; B.addAttribute(Kind, Value); return addAttributes(C, AttributeSet::get(C, B)); @@ -788,48 +788,44 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const { // AttributeListImpl Definition //===----------------------------------------------------------------------===// -AttributeListImpl::AttributeListImpl( - LLVMContext &C, ArrayRef> Slots) - : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) { -#ifndef NDEBUG - assert(!Slots.empty() && "pointless AttributeListImpl"); - if (Slots.size() >= 2) { - auto &PrevPair = Slots.front(); - for (auto &CurPair : Slots.drop_front()) { - assert(PrevPair.first <= CurPair.first && "Attribute set not ordered!"); - } - } -#endif +/// Map from AttributeList index to the internal array index. Adding one works: +/// FunctionIndex: ~0U -> 0 +/// ReturnIndex: 0 -> 1 +/// FirstArgIndex: 1.. -> 2.. +static constexpr unsigned attrIdxToArrayIdx(unsigned Index) { + // MSVC warns about '~0U + 1' wrapping around when this is called on + // FunctionIndex, so cast to int first. + return static_cast(Index) + 1; +} + +AttributeListImpl::AttributeListImpl(LLVMContext &C, + ArrayRef Sets) + : AvailableFunctionAttrs(0), Context(C), NumAttrSets(Sets.size()) { + assert(!Sets.empty() && "pointless AttributeListImpl"); // There's memory after the node where we can store the entries in. 
- std::copy(Slots.begin(), Slots.end(), getTrailingObjects()); + std::copy(Sets.begin(), Sets.end(), getTrailingObjects()); // Initialize AvailableFunctionAttrs summary bitset. static_assert(Attribute::EndAttrKinds <= sizeof(AvailableFunctionAttrs) * CHAR_BIT, "Too many attributes"); - static_assert(AttributeList::FunctionIndex == ~0u, - "FunctionIndex should be biggest possible index"); - const auto &Last = Slots.back(); - if (Last.first == AttributeList::FunctionIndex) { - AttributeSet Node = Last.second; - for (Attribute I : Node) { - if (!I.isStringAttribute()) - AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum(); - } + static_assert(attrIdxToArrayIdx(AttributeList::FunctionIndex) == 0U, + "function should be stored in slot 0"); + for (Attribute I : Sets[0]) { + if (!I.isStringAttribute()) + AvailableFunctionAttrs |= 1ULL << I.getKindAsEnum(); } } void AttributeListImpl::Profile(FoldingSetNodeID &ID) const { - Profile(ID, makeArrayRef(getSlotPair(0), getNumSlots())); + Profile(ID, makeArrayRef(begin(), end())); } -void AttributeListImpl::Profile( - FoldingSetNodeID &ID, ArrayRef> Nodes) { - for (const auto &Node : Nodes) { - ID.AddInteger(Node.first); - ID.AddPointer(Node.second.SetNode); - } +void AttributeListImpl::Profile(FoldingSetNodeID &ID, + ArrayRef Sets) { + for (const auto &Set : Sets) + ID.AddPointer(Set.SetNode); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) @@ -842,24 +838,13 @@ LLVM_DUMP_METHOD void AttributeListImpl::dump() const { // AttributeList Construction and Mutation Methods //===----------------------------------------------------------------------===// -AttributeList AttributeList::getImpl( - LLVMContext &C, ArrayRef> Attrs) { - assert(!Attrs.empty() && "creating pointless AttributeList"); -#ifndef NDEBUG - unsigned LastIndex = 0; - bool IsFirst = true; - for (auto &&AttrPair : Attrs) { - assert((IsFirst || LastIndex < AttrPair.first) && - "unsorted or duplicate AttributeList indices"); - assert(AttrPair.second.hasAttributes() && "pointless AttributeList slot"); - LastIndex = AttrPair.first; - IsFirst = false; - } -#endif +AttributeList AttributeList::getImpl(LLVMContext &C, + ArrayRef AttrSets) { + assert(!AttrSets.empty() && "pointless AttributeListImpl"); LLVMContextImpl *pImpl = C.pImpl; FoldingSetNodeID ID; - AttributeListImpl::Profile(ID, Attrs); + AttributeListImpl::Profile(ID, AttrSets); void *InsertPoint; AttributeListImpl *PA = @@ -870,8 +855,8 @@ AttributeList AttributeList::getImpl( if (!PA) { // Coallocate entries after the AttributeListImpl itself. 
void *Mem = ::operator new( - AttributeListImpl::totalSizeToAlloc(Attrs.size())); - PA = new (Mem) AttributeListImpl(C, Attrs); + AttributeListImpl::totalSizeToAlloc(AttrSets.size())); + PA = new (Mem) AttributeListImpl(C, AttrSets); pImpl->AttrsLists.InsertNode(PA, InsertPoint); } @@ -912,7 +897,7 @@ AttributeList::get(LLVMContext &C, AttrPairVec.emplace_back(Index, AttributeSet::get(C, AttrVec)); } - return getImpl(C, AttrPairVec); + return get(C, AttrPairVec); } AttributeList @@ -922,35 +907,76 @@ AttributeList::get(LLVMContext &C, if (Attrs.empty()) return AttributeList(); - return getImpl(C, Attrs); + assert(std::is_sorted(Attrs.begin(), Attrs.end(), + [](const std::pair &LHS, + const std::pair &RHS) { + return LHS.first < RHS.first; + }) && + "Misordered Attributes list!"); + assert(none_of(Attrs, + [](const std::pair &Pair) { + return !Pair.second.hasAttributes(); + }) && + "Pointless attribute!"); + + unsigned MaxIndex = Attrs.back().first; + + SmallVector AttrVec(attrIdxToArrayIdx(MaxIndex) + 1); + for (auto Pair : Attrs) + AttrVec[attrIdxToArrayIdx(Pair.first)] = Pair.second; + + return getImpl(C, AttrVec); } AttributeList AttributeList::get(LLVMContext &C, AttributeSet FnAttrs, AttributeSet RetAttrs, ArrayRef ArgAttrs) { - SmallVector, 8> AttrPairs; - if (RetAttrs.hasAttributes()) - AttrPairs.emplace_back(ReturnIndex, RetAttrs); - size_t Index = 1; - for (AttributeSet AS : ArgAttrs) { - if (AS.hasAttributes()) - AttrPairs.emplace_back(Index, AS); - ++Index; + // Scan from the end to find the last argument with attributes. Most + // arguments don't have attributes, so it's nice if we can have fewer unique + // AttributeListImpls by dropping empty attribute sets at the end of the list. + unsigned NumSets = 0; + for (size_t I = ArgAttrs.size(); I != 0; --I) { + if (ArgAttrs[I - 1].hasAttributes()) { + NumSets = I + 2; + break; + } } - if (FnAttrs.hasAttributes()) - AttrPairs.emplace_back(FunctionIndex, FnAttrs); - if (AttrPairs.empty()) + if (NumSets == 0) { + // Check function and return attributes if we didn't have argument + // attributes. + if (RetAttrs.hasAttributes()) + NumSets = 2; + else if (FnAttrs.hasAttributes()) + NumSets = 1; + } + + // If all attribute sets were empty, we can use the empty attribute list. + if (NumSets == 0) return AttributeList(); - return getImpl(C, AttrPairs); + + SmallVector AttrSets; + AttrSets.reserve(NumSets); + // If we have any attributes, we always have function attributes. + AttrSets.push_back(FnAttrs); + if (NumSets > 1) + AttrSets.push_back(RetAttrs); + if (NumSets > 2) { + // Drop the empty argument attribute sets at the end. 
+ ArgAttrs = ArgAttrs.take_front(NumSets - 2); + AttrSets.insert(AttrSets.end(), ArgAttrs.begin(), ArgAttrs.end()); + } + + return getImpl(C, AttrSets); } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, const AttrBuilder &B) { if (!B.hasAttributes()) return AttributeList(); - AttributeSet AS = AttributeSet::get(C, B); - std::pair Arr[1] = {{Index, AS}}; - return getImpl(C, Arr); + Index = attrIdxToArrayIdx(Index); + SmallVector AttrSets(Index + 1); + AttrSets[Index] = AttributeSet::get(C, B); + return getImpl(C, AttrSets); } AttributeList AttributeList::get(LLVMContext &C, unsigned Index, @@ -973,32 +999,22 @@ AttributeList AttributeList::get(LLVMContext &C, ArrayRef Attrs) { if (Attrs.empty()) return AttributeList(); - if (Attrs.size() == 1) return Attrs[0]; + if (Attrs.size() == 1) + return Attrs[0]; - SmallVector, 8> AttrNodeVec; - AttributeListImpl *A0 = Attrs[0].pImpl; - if (A0) - AttrNodeVec.append(A0->getSlotPair(0), A0->getSlotPair(A0->getNumSlots())); - // Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec - // ordered by index. Because we know that each list in Attrs is ordered by - // index we only need to merge each successive list in rather than doing a - // full sort. - for (unsigned I = 1, E = Attrs.size(); I != E; ++I) { - AttributeListImpl *ALI = Attrs[I].pImpl; - if (!ALI) continue; - SmallVector, 8>::iterator - ANVI = AttrNodeVec.begin(), ANVE; - for (const IndexAttrPair *AI = ALI->getSlotPair(0), - *AE = ALI->getSlotPair(ALI->getNumSlots()); - AI != AE; ++AI) { - ANVE = AttrNodeVec.end(); - while (ANVI != ANVE && ANVI->first <= AI->first) - ++ANVI; - ANVI = AttrNodeVec.insert(ANVI, *AI) + 1; - } + unsigned MaxSize = 0; + for (AttributeList List : Attrs) + MaxSize = std::max(MaxSize, List.getNumAttrSets()); + + SmallVector NewAttrSets(MaxSize); + for (unsigned I = 0; I < MaxSize; ++I) { + AttrBuilder CurBuilder; + for (AttributeList List : Attrs) + CurBuilder.merge(List.getAttributes(I - 1)); + NewAttrSets[I] = AttributeSet::get(C, CurBuilder); } - return getImpl(C, AttrNodeVec); + return getImpl(C, NewAttrSets); } AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index, @@ -1022,29 +1038,19 @@ AttributeList AttributeList::addAttribute(LLVMContext &C, Attribute A) const { assert(std::is_sorted(Indices.begin(), Indices.end())); - unsigned I = 0, E = pImpl ? pImpl->getNumSlots() : 0; - SmallVector AttrVec; - for (unsigned Index : Indices) { - // Add all attribute slots before the current index. - for (; I < E && getSlotIndex(I) < Index; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); + SmallVector AttrSets(this->begin(), this->end()); + unsigned MaxIndex = attrIdxToArrayIdx(Indices.back()); + if (MaxIndex >= AttrSets.size()) + AttrSets.resize(MaxIndex + 1); - // Add the attribute at this index. If we already have attributes at this - // index, merge them into a new set. - AttrBuilder B; - if (I < E && getSlotIndex(I) == Index) { - B.merge(AttrBuilder(pImpl->getSlotAttributes(I))); - ++I; - } + for (unsigned Index : Indices) { + Index = attrIdxToArrayIdx(Index); + AttrBuilder B(AttrSets[Index]); B.addAttribute(A); - AttrVec.emplace_back(Index, AttributeSet::get(C, B)); + AttrSets[Index] = AttributeSet::get(C, B); } - // Add remaining attributes. 
- for (; I < E; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); - - return get(C, AttrVec); + return getImpl(C, AttrSets); } AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, @@ -1064,33 +1070,16 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index, "Attempt to change alignment!"); #endif - SmallVector AttrVec; - uint64_t NumAttrs = pImpl->getNumSlots(); - unsigned I; + Index = attrIdxToArrayIdx(Index); + SmallVector AttrSets(this->begin(), this->end()); + if (Index >= AttrSets.size()) + AttrSets.resize(Index + 1); - // Add all the attribute slots before the one we need to merge. - for (I = 0; I < NumAttrs; ++I) { - if (getSlotIndex(I) >= Index) - break; - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); - } + AttrBuilder Merged(AttrSets[Index]); + Merged.merge(B); + AttrSets[Index] = AttributeSet::get(C, Merged); - AttrBuilder NewAttrs; - if (I < NumAttrs && getSlotIndex(I) == Index) { - // We need to merge the attribute sets. - NewAttrs.merge(pImpl->getSlotAttributes(I)); - ++I; - } - NewAttrs.merge(B); - - // Add the new or merged attribute set at this index. - AttrVec.emplace_back(Index, AttributeSet::get(C, NewAttrs)); - - // Add the remaining entries. - for (; I < NumAttrs; ++I) - AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotAttributes(I)); - - return get(C, AttrVec); + return getImpl(C, AttrSets); } AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, @@ -1109,54 +1098,38 @@ AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index, return removeAttributes(C, Index, B); } -AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index, - const AttrBuilder &Attrs) const { +AttributeList +AttributeList::removeAttributes(LLVMContext &C, unsigned Index, + const AttrBuilder &AttrsToRemove) const { if (!pImpl) return AttributeList(); // FIXME it is not obvious how this should work for alignment. // For now, say we can't pass in alignment, which no current use does. - assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!"); + assert(!AttrsToRemove.hasAlignmentAttr() && "Attempt to change alignment!"); - // Add the attribute slots before the one we're trying to add. - SmallVector AttrSets; - uint64_t NumAttrs = pImpl->getNumSlots(); - AttrBuilder B; - uint64_t LastIndex = 0; - for (unsigned I = 0, E = NumAttrs; I != E; ++I) { - if (getSlotIndex(I) >= Index) { - if (getSlotIndex(I) == Index) - B = AttrBuilder(getSlotAttributes(LastIndex++)); - break; - } - LastIndex = I + 1; - AttrSets.push_back({getSlotIndex(I), getSlotAttributes(I)}); - } + Index = attrIdxToArrayIdx(Index); + SmallVector AttrSets(this->begin(), this->end()); + if (Index >= AttrSets.size()) + AttrSets.resize(Index + 1); - // Remove the attributes from the existing set and add them. - B.remove(Attrs); - if (B.hasAttributes()) - AttrSets.push_back({Index, AttributeSet::get(C, B)}); + AttrBuilder B(AttrSets[Index]); + B.remove(AttrsToRemove); + AttrSets[Index] = AttributeSet::get(C, B); - // Add the remaining attribute slots. 
- for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I) - AttrSets.push_back({getSlotIndex(I), getSlotAttributes(I)}); - - return get(C, AttrSets); + return getImpl(C, AttrSets); } AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned WithoutIndex) const { if (!pImpl) return AttributeList(); - - SmallVector, 4> AttrSet; - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) { - unsigned Index = getSlotIndex(I); - if (Index != WithoutIndex) - AttrSet.push_back({Index, pImpl->getSlotAttributes(I)}); - } - return get(C, AttrSet); + WithoutIndex = attrIdxToArrayIdx(WithoutIndex); + if (WithoutIndex >= getNumAttrSets()) + return *this; + SmallVector AttrSets(this->begin(), this->end()); + AttrSets[WithoutIndex] = AttributeSet(); + return getImpl(C, AttrSets); } AttributeList AttributeList::addDereferenceableAttr(LLVMContext &C, @@ -1225,20 +1198,20 @@ bool AttributeList::hasFnAttribute(StringRef Kind) const { bool AttributeList::hasParamAttribute(unsigned ArgNo, Attribute::AttrKind Kind) const { - return hasAttribute(ArgNo + 1, Kind); + return hasAttribute(ArgNo + FirstArgIndex, Kind); } bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr, unsigned *Index) const { if (!pImpl) return false; - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) - for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I); - II != IE; ++II) - if (II->hasAttribute(Attr)) { - if (Index) *Index = pImpl->getSlotIndex(I); - return true; - } + for (unsigned I = index_begin(), E = index_end(); I != E; ++I) { + if (hasAttribute(I, Attr)) { + if (Index) + *Index = I; + return true; + } + } return false; } @@ -1282,60 +1255,35 @@ std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const { } AttributeSet AttributeList::getAttributes(unsigned Index) const { - if (!pImpl) return AttributeSet(); - - // Loop through to find the attribute node we want. - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) - if (pImpl->getSlotIndex(I) == Index) - return pImpl->getSlotAttributes(I); - - return AttributeSet(); + Index = attrIdxToArrayIdx(Index); + if (!pImpl || Index >= getNumAttrSets()) + return AttributeSet(); + return pImpl->begin()[Index]; } -AttributeList::iterator AttributeList::begin(unsigned Slot) const { - if (!pImpl) - return ArrayRef().begin(); - return pImpl->begin(Slot); +AttributeList::iterator AttributeList::begin() const { + return pImpl ? pImpl->begin() : nullptr; } -AttributeList::iterator AttributeList::end(unsigned Slot) const { - if (!pImpl) - return ArrayRef().end(); - return pImpl->end(Slot); +AttributeList::iterator AttributeList::end() const { + return pImpl ? pImpl->end() : nullptr; } //===----------------------------------------------------------------------===// // AttributeList Introspection Methods //===----------------------------------------------------------------------===// -unsigned AttributeList::getNumSlots() const { - return pImpl ? pImpl->getNumSlots() : 0; -} - -unsigned AttributeList::getSlotIndex(unsigned Slot) const { - assert(pImpl && Slot < pImpl->getNumSlots() && - "Slot # out of range!"); - return pImpl->getSlotIndex(Slot); -} - -AttributeSet AttributeList::getSlotAttributes(unsigned Slot) const { - assert(pImpl && Slot < pImpl->getNumSlots() && - "Slot # out of range!"); - return pImpl->getSlotAttributes(Slot); +unsigned AttributeList::getNumAttrSets() const { + return pImpl ? 
pImpl->NumAttrSets : 0; } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void AttributeList::dump() const { dbgs() << "PAL[\n"; - for (unsigned i = 0, e = getNumSlots(); i < e; ++i) { - uint64_t Index = getSlotIndex(i); - dbgs() << " { "; - if (Index == ~0U) - dbgs() << "~0U"; - else - dbgs() << Index; - dbgs() << " => " << getAsString(Index) << " }\n"; + for (unsigned i = index_begin(), e = index_end(); i != e; ++i) { + if (getAttributes(i).hasAttributes()) + dbgs() << " { " << i << " => " << getAsString(i) << " }\n"; } dbgs() << "]\n"; @@ -1346,26 +1294,16 @@ LLVM_DUMP_METHOD void AttributeList::dump() const { // AttrBuilder Method Implementations //===----------------------------------------------------------------------===// +// FIXME: Remove this ctor, use AttributeSet. AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) { - AttributeListImpl *pImpl = AL.pImpl; - if (!pImpl) return; - - for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) { - if (pImpl->getSlotIndex(I) != Index) continue; - - for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I); - II != IE; ++II) - addAttribute(*II); - - break; - } + AttributeSet AS = AL.getAttributes(Index); + for (const Attribute &A : AS) + addAttribute(A); } AttrBuilder::AttrBuilder(AttributeSet AS) { - if (AS.hasAttributes()) { - for (const Attribute &A : AS) - addAttribute(A); - } + for (const Attribute &A : AS) + addAttribute(A); } void AttrBuilder::clear() { diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp index 90ca21ab91f8..1f8659d4e2ca 100644 --- a/lib/IR/BasicBlock.cpp +++ b/lib/IR/BasicBlock.cpp @@ -263,6 +263,10 @@ const BasicBlock *BasicBlock::getUniqueSuccessor() const { return SuccBB; } +iterator_range BasicBlock::phis() { + return make_range(dyn_cast(&front()), nullptr); +} + /// This method is used to notify a BasicBlock that the /// specified Predecessor of the block is no longer able to reach it. This is /// actually not used to update the Predecessor list, but is actually used to @@ -389,13 +393,11 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) { // Loop over any phi nodes in the basic block, updating the BB field of // incoming values... BasicBlock *Successor = *I; - PHINode *PN; - for (BasicBlock::iterator II = Successor->begin(); - (PN = dyn_cast(II)); ++II) { - int IDX = PN->getBasicBlockIndex(this); - while (IDX != -1) { - PN->setIncomingBlock((unsigned)IDX, New); - IDX = PN->getBasicBlockIndex(this); + for (auto &PN : Successor->phis()) { + int Idx = PN.getBasicBlockIndex(this); + while (Idx != -1) { + PN.setIncomingBlock((unsigned)Idx, New); + Idx = PN.getBasicBlockIndex(this); } } } diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index 3168ec6944a3..b7e3f0c6779e 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -163,7 +163,7 @@ void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, // Fix up debug variables to point to NewSP. auto reparentVar = [&](DILocalVariable *Var) { - return DILocalVariable::getDistinct( + return DILocalVariable::get( Ctx, cast( reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)), diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 01d4ed6c8eef..d7baa9ebc223 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -454,6 +454,9 @@ bool CallInst::dataOperandHasImpliedAttr(unsigned i, // question is a call argument; or be indirectly implied by the kind of its // containing operand bundle, if the operand is a bundle operand. 
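The new BasicBlock::phis() range and the splitBasicBlock hunk above both deal with the same chore: when a block is split, its successors' PHI nodes still name the old block as the incoming predecessor, and each matching entry must be retargeted to the new block. A toy model of that retargeting (illustrative names, not the LLVM API):

#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Toy PHI node: a list of (incoming block, incoming value) pairs.
struct ToyPhi {
  std::vector<std::pair<std::string, int>> Incoming;
};

// Retarget every incoming entry that names OldPred so it names NewPred,
// mirroring what splitBasicBlock does for the PHIs of each successor.
static void retargetIncoming(ToyPhi &PN, const std::string &OldPred,
                             const std::string &NewPred) {
  for (auto &In : PN.Incoming)
    if (In.first == OldPred)
      In.first = NewPred;
}

int main() {
  // %p = phi [1, %entry], [2, %loop]; "entry" is split, tail -> "entry.split".
  ToyPhi P{{{"entry", 1}, {"loop", 2}}};
  retargetIncoming(P, "entry", "entry.split");
  assert(P.Incoming[0].first == "entry.split" && P.Incoming[1].first == "loop");
  return 0;
}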
+ if (i == AttributeList::ReturnIndex) + return hasRetAttr(Kind); + // FIXME: Avoid these i - 1 calculations and update the API to use zero-based // indices. if (i < (getNumArgOperands() + 1)) @@ -779,6 +782,9 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i, // question is an invoke argument; or be indirectly implied by the kind of its // containing operand bundle, if the operand is a bundle operand. + if (i == AttributeList::ReturnIndex) + return hasRetAttr(Kind); + // FIXME: Avoid these i - 1 calculations and update the API to use zero-based // indices. if (i < (getNumArgOperands() + 1)) diff --git a/lib/IR/IntrinsicInst.cpp b/lib/IR/IntrinsicInst.cpp index c9814a96bea6..94e115a6a78d 100644 --- a/lib/IR/IntrinsicInst.cpp +++ b/lib/IR/IntrinsicInst.cpp @@ -97,7 +97,9 @@ Value *InstrProfIncrementInst::getStep() const { ConstrainedFPIntrinsic::RoundingMode ConstrainedFPIntrinsic::getRoundingMode() const { - Metadata *MD = dyn_cast(getOperand(2))->getMetadata(); + unsigned NumOperands = getNumArgOperands(); + Metadata *MD = + dyn_cast(getArgOperand(NumOperands - 2))->getMetadata(); if (!MD || !isa(MD)) return rmInvalid; StringRef RoundingArg = cast(MD)->getString(); @@ -115,7 +117,9 @@ ConstrainedFPIntrinsic::getRoundingMode() const { ConstrainedFPIntrinsic::ExceptionBehavior ConstrainedFPIntrinsic::getExceptionBehavior() const { - Metadata *MD = dyn_cast(getOperand(3))->getMetadata(); + unsigned NumOperands = getNumArgOperands(); + Metadata *MD = + dyn_cast(getArgOperand(NumOperands - 1))->getMetadata(); if (!MD || !isa(MD)) return ebInvalid; StringRef ExceptionArg = cast(MD)->getString(); @@ -125,3 +129,21 @@ ConstrainedFPIntrinsic::getExceptionBehavior() const { .Case("fpexcept.strict", ebStrict) .Default(ebInvalid); } + +bool ConstrainedFPIntrinsic::isUnaryOp() const { + switch (getIntrinsicID()) { + default: + return false; + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: + return true; + } +} diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index 12c258d95f52..95673e515a55 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -481,7 +481,7 @@ PICLevel::Level Module::getPICLevel() const { } void Module::setPICLevel(PICLevel::Level PL) { - addModuleFlag(ModFlagBehavior::Error, "PIC Level", PL); + addModuleFlag(ModFlagBehavior::Max, "PIC Level", PL); } PIELevel::Level Module::getPIELevel() const { @@ -495,7 +495,7 @@ PIELevel::Level Module::getPIELevel() const { } void Module::setPIELevel(PIELevel::Level PL) { - addModuleFlag(ModFlagBehavior::Error, "PIE Level", PL); + addModuleFlag(ModFlagBehavior::Max, "PIE Level", PL); } void Module::setProfileSummary(Metadata *M) { diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 21e8048442be..a8523236ac9f 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -1282,6 +1282,13 @@ Verifier::visitModuleFlag(const MDNode *Op, // These behavior types accept any value. 
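For the experimental constrained FP intrinsics touched above, the rounding mode and exception behavior travel as the last two metadata arguments, so a unary operation carries three arguments and a binary one four, and the accessors now index from the end rather than from a fixed position. A standalone sketch of that layout and of the string mapping, using the metadata spellings from LangRef (toy types, not IntrinsicInst):

#include <cassert>
#include <string>
#include <vector>

enum RoundingMode { rmInvalid, rmDynamic, rmToNearest, rmDownward, rmUpward, rmTowardZero };
enum ExceptionBehavior { ebInvalid, ebIgnore, ebMayTrap, ebStrict };

// Toy call: value operands followed by the two metadata strings.
struct ToyConstrainedCall {
  std::vector<std::string> Args; // e.g. {"%x", "%y", "round.dynamic", "fpexcept.strict"}

  RoundingMode getRoundingMode() const {
    const std::string &S = Args[Args.size() - 2]; // second-to-last operand
    if (S == "round.dynamic") return rmDynamic;
    if (S == "round.tonearest") return rmToNearest;
    if (S == "round.downward") return rmDownward;
    if (S == "round.upward") return rmUpward;
    if (S == "round.towardzero") return rmTowardZero;
    return rmInvalid;
  }

  ExceptionBehavior getExceptionBehavior() const {
    const std::string &S = Args.back();           // last operand
    if (S == "fpexcept.ignore") return ebIgnore;
    if (S == "fpexcept.maytrap") return ebMayTrap;
    if (S == "fpexcept.strict") return ebStrict;
    return ebInvalid;
  }

  // Verifier-style check: unary ops carry 3 operands, binary ops 4.
  bool hasPlausibleOperandCount(bool IsUnary) const {
    return Args.size() == (IsUnary ? 3u : 4u);
  }
};

int main() {
  ToyConstrainedCall Sqrt{{"%x", "round.dynamic", "fpexcept.strict"}};
  assert(Sqrt.hasPlausibleOperandCount(/*IsUnary=*/true));
  assert(Sqrt.getRoundingMode() == rmDynamic && Sqrt.getExceptionBehavior() == ebStrict);
  return 0;
}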
break; + case Module::Max: { + Assert(mdconst::dyn_extract_or_null(Op->getOperand(2)), + "invalid value for 'max' module flag (expected constant integer)", + Op->getOperand(2)); + break; + } + case Module::Require: { // The value should itself be an MDNode with two operands, a flag ID (an // MDString), and a value. @@ -1729,17 +1736,9 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) { } bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) { - if (Attrs.getNumSlots() == 0) - return true; - - unsigned LastSlot = Attrs.getNumSlots() - 1; - unsigned LastIndex = Attrs.getSlotIndex(LastSlot); - if (LastIndex <= Params || - (LastIndex == AttributeList::FunctionIndex && - (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params))) - return true; - - return false; + // There shouldn't be more attribute sets than there are parameters plus the + // function and return value. + return Attrs.getNumAttrSets() <= Params + 2; } /// Verify that statepoint intrinsic is well formed. @@ -3967,6 +3966,18 @@ void Verifier::visitIntrinsicCallSite(Intrinsic::ID ID, CallSite CS) { case Intrinsic::experimental_constrained_fmul: case Intrinsic::experimental_constrained_fdiv: case Intrinsic::experimental_constrained_frem: + case Intrinsic::experimental_constrained_sqrt: + case Intrinsic::experimental_constrained_pow: + case Intrinsic::experimental_constrained_powi: + case Intrinsic::experimental_constrained_sin: + case Intrinsic::experimental_constrained_cos: + case Intrinsic::experimental_constrained_exp: + case Intrinsic::experimental_constrained_exp2: + case Intrinsic::experimental_constrained_log: + case Intrinsic::experimental_constrained_log10: + case Intrinsic::experimental_constrained_log2: + case Intrinsic::experimental_constrained_rint: + case Intrinsic::experimental_constrained_nearbyint: visitConstrainedFPIntrinsic( cast(*CS.getInstruction())); break; @@ -4336,7 +4347,12 @@ static DISubprogram *getSubprogram(Metadata *LocalScope) { } void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { - Assert(isa(FPI.getOperand(2)), + unsigned NumOperands = FPI.getNumArgOperands(); + Assert(((NumOperands == 3 && FPI.isUnaryOp()) || (NumOperands == 4)), + "invalid arguments for constrained FP intrinsic", &FPI); + Assert(isa(FPI.getArgOperand(NumOperands-1)), + "invalid exception behavior argument", &FPI); + Assert(isa(FPI.getArgOperand(NumOperands-2)), "invalid rounding mode argument", &FPI); Assert(FPI.getRoundingMode() != ConstrainedFPIntrinsic::rmInvalid, "invalid rounding mode argument", &FPI); diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp index c73b6b6b15c1..9efc095f9fcf 100644 --- a/lib/LTO/LTO.cpp +++ b/lib/LTO/LTO.cpp @@ -114,7 +114,10 @@ static void computeCacheKey( AddUnsigned((unsigned)Conf.Options.DebuggerTuning); for (auto &A : Conf.MAttrs) AddString(A); - AddUnsigned(Conf.RelocModel); + if (Conf.RelocModel) + AddUnsigned(*Conf.RelocModel); + else + AddUnsigned(-1); AddUnsigned(Conf.CodeModel); AddUnsigned(Conf.CGOptLevel); AddUnsigned(Conf.CGFileType); @@ -539,16 +542,10 @@ Error LTO::addRegularLTO(BitcodeModule BM, if (Sym.isUndefined()) continue; Keep.push_back(GV); - switch (GV->getLinkage()) { - default: - break; - case GlobalValue::LinkOnceAnyLinkage: - GV->setLinkage(GlobalValue::WeakAnyLinkage); - break; - case GlobalValue::LinkOnceODRLinkage: - GV->setLinkage(GlobalValue::WeakODRLinkage); - break; - } + GlobalValue::LinkageTypes OriginalLinkage = GV->getLinkage(); + if (GlobalValue::isLinkOnceLinkage(OriginalLinkage)) + 
GV->setLinkage(GlobalValue::getWeakLinkage( + GlobalValue::isLinkOnceODRLinkage(OriginalLinkage))); } else if (isa(GV) && (GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() || GV->hasAvailableExternallyLinkage()) && @@ -999,10 +996,6 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ExportedGUIDs.insert(GUID); } - auto isPrevailing = [&](GlobalValue::GUID GUID, - const GlobalValueSummary *S) { - return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); - }; auto isExported = [&](StringRef ModuleIdentifier, GlobalValue::GUID GUID) { const auto &ExportList = ExportLists.find(ModuleIdentifier); return (ExportList != ExportLists.end() && @@ -1010,17 +1003,20 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache, ExportedGUIDs.count(GUID); }; thinLTOInternalizeAndPromoteInIndex(ThinLTO.CombinedIndex, isExported); - - auto recordNewLinkage = [&](StringRef ModuleIdentifier, - GlobalValue::GUID GUID, - GlobalValue::LinkageTypes NewLinkage) { - ResolvedODR[ModuleIdentifier][GUID] = NewLinkage; - }; - - thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing, - recordNewLinkage); } + auto isPrevailing = [&](GlobalValue::GUID GUID, + const GlobalValueSummary *S) { + return ThinLTO.PrevailingModuleForGUID[GUID] == S->modulePath(); + }; + auto recordNewLinkage = [&](StringRef ModuleIdentifier, + GlobalValue::GUID GUID, + GlobalValue::LinkageTypes NewLinkage) { + ResolvedODR[ModuleIdentifier][GUID] = NewLinkage; + }; + thinLTOResolveWeakForLinkerInIndex(ThinLTO.CombinedIndex, isPrevailing, + recordNewLinkage); + std::unique_ptr BackendProc = ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries, AddStream, Cache); diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp index 30447c528af1..668667a53562 100644 --- a/lib/LTO/LTOBackend.cpp +++ b/lib/LTO/LTOBackend.cpp @@ -117,15 +117,22 @@ Error Config::addSaveTemps(std::string OutputFileName, namespace { std::unique_ptr -createTargetMachine(Config &Conf, StringRef TheTriple, - const Target *TheTarget) { +createTargetMachine(Config &Conf, const Target *TheTarget, Module &M) { + StringRef TheTriple = M.getTargetTriple(); SubtargetFeatures Features; Features.getDefaultSubtargetFeatures(Triple(TheTriple)); for (const std::string &A : Conf.MAttrs) Features.AddFeature(A); + Reloc::Model RelocModel; + if (Conf.RelocModel) + RelocModel = *Conf.RelocModel; + else + RelocModel = + M.getPICLevel() == PICLevel::NotPIC ? Reloc::Static : Reloc::PIC_; + return std::unique_ptr(TheTarget->createTargetMachine( - TheTriple, Conf.CPU, Features.getString(), Conf.Options, Conf.RelocModel, + TheTriple, Conf.CPU, Features.getString(), Conf.Options, RelocModel, Conf.CodeModel, Conf.CGOptLevel)); } @@ -311,7 +318,7 @@ void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream, std::unique_ptr MPartInCtx = std::move(MOrErr.get()); std::unique_ptr TM = - createTargetMachine(C, MPartInCtx->getTargetTriple(), T); + createTargetMachine(C, T, *MPartInCtx); codegen(C, TM.get(), AddStream, ThreadId, *MPartInCtx); }, @@ -360,8 +367,7 @@ Error lto::backend(Config &C, AddStreamFn AddStream, if (!TOrErr) return TOrErr.takeError(); - std::unique_ptr TM = - createTargetMachine(C, Mod->getTargetTriple(), *TOrErr); + std::unique_ptr TM = createTargetMachine(C, *TOrErr, *Mod); // Setup optimization remarks. 
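createTargetMachine now receives the Module so that, when the LTO config leaves the relocation model unset, it can be derived from the module's PIC level rather than assumed. A small sketch of that selection rule, with std::optional standing in for the optional config field (illustrative names):

#include <cassert>
#include <optional>

enum class PICLevel { NotPIC, SmallPIC, BigPIC };
enum class RelocModel { Static, PIC };

// If the configuration pins a relocation model, honour it; otherwise fall
// back to what the module's "PIC Level" flag implies, as LTOBackend now does.
static RelocModel pickRelocModel(std::optional<RelocModel> Configured,
                                 PICLevel ModulePICLevel) {
  if (Configured)
    return *Configured;
  return ModulePICLevel == PICLevel::NotPIC ? RelocModel::Static
                                            : RelocModel::PIC;
}

int main() {
  assert(pickRelocModel(std::nullopt, PICLevel::BigPIC) == RelocModel::PIC);
  assert(pickRelocModel(RelocModel::Static, PICLevel::BigPIC) == RelocModel::Static);
  return 0;
}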
auto DiagFileOrErr = lto::setupOptimizationRemarks( @@ -397,8 +403,7 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream, if (!TOrErr) return TOrErr.takeError(); - std::unique_ptr TM = - createTargetMachine(Conf, Mod.getTargetTriple(), *TOrErr); + std::unique_ptr TM = createTargetMachine(Conf, *TOrErr, Mod); if (Conf.CodeGenOnly) { codegen(Conf, TM.get(), AddStream, Task, Mod); diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp index c0af21aa148c..defad1904989 100644 --- a/lib/Linker/IRMover.cpp +++ b/lib/Linker/IRMover.cpp @@ -1157,6 +1157,11 @@ Error IRLinker::linkModuleFlagsMetadata() { mdconst::extract(DstOp->getOperand(0)); unsigned DstBehaviorValue = DstBehavior->getZExtValue(); + auto overrideDstValue = [&]() { + DstModFlags->setOperand(DstIndex, SrcOp); + Flags[ID].first = SrcOp; + }; + // If either flag has override behavior, handle it first. if (DstBehaviorValue == Module::Override) { // Diagnose inconsistent flags which both have override behavior. @@ -1167,8 +1172,7 @@ Error IRLinker::linkModuleFlagsMetadata() { continue; } else if (SrcBehaviorValue == Module::Override) { // Update the destination flag to that of the source. - DstModFlags->setOperand(DstIndex, SrcOp); - Flags[ID].first = SrcOp; + overrideDstValue(); continue; } @@ -1204,6 +1208,15 @@ Error IRLinker::linkModuleFlagsMetadata() { } continue; } + case Module::Max: { + ConstantInt *DstValue = + mdconst::extract(DstOp->getOperand(2)); + ConstantInt *SrcValue = + mdconst::extract(SrcOp->getOperand(2)); + if (SrcValue->getZExtValue() > DstValue->getZExtValue()) + overrideDstValue(); + break; + } case Module::Append: { MDNode *DstValue = cast(DstOp->getOperand(2)); MDNode *SrcValue = cast(SrcOp->getOperand(2)); diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp index 0540c4c47a3f..8c3df36cfb48 100644 --- a/lib/MC/WasmObjectWriter.cpp +++ b/lib/MC/WasmObjectWriter.cpp @@ -422,6 +422,7 @@ static void ApplyRelocations( RelEntry.Offset; switch (RelEntry.Type) { case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: { + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; assert(RelEntry.Addend == 0); @@ -429,6 +430,7 @@ static void ApplyRelocations( break; } case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: { + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; assert(RelEntry.Addend == 0); @@ -448,6 +450,7 @@ static void ApplyRelocations( break; } case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: { + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; assert(RelEntry.Addend == 0); @@ -478,6 +481,7 @@ WriteRelocations(ArrayRef Relocations, uint64_t Offset = RelEntry.Offset + RelEntry.FixupSection->getSectionOffset() + HeaderSize; + assert(SymbolIndices.count(RelEntry.Symbol)); uint32_t Index = SymbolIndices[RelEntry.Symbol]; int64_t Addend = RelEntry.Addend; @@ -726,10 +730,6 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, if (IsAddressTaken.count(&WS)) TableElems.push_back(Index); } else { - // For now, ignore temporary non-function symbols. - if (S.isTemporary()) - continue; - if (WS.getOffset() != 0) report_fatal_error("data sections must contain one variable each"); if (!WS.getSize()) @@ -777,20 +777,18 @@ void WasmObjectWriter::writeObject(MCAssembler &Asm, } } - // For each external global, prepare a corresponding wasm global - // holding its address. 
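The new Module::Max flag behavior, used above for "PIC Level" and "PIE Level", keeps the larger of the two integer values when modules are linked. A standalone model of that merge rule next to Override and Error for contrast (a simplified subset of what IRMover handles):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <stdexcept>

enum class FlagBehavior { Error, Max, Override };

// Merge one integer-valued module flag from a source module into the
// destination, following (a subset of) the behaviors handled by IRMover.
static uint64_t mergeFlag(FlagBehavior Behavior, uint64_t Dst, uint64_t Src) {
  switch (Behavior) {
  case FlagBehavior::Override:
    return Src;                // source always wins
  case FlagBehavior::Max:
    return std::max(Dst, Src); // keep the larger value
  case FlagBehavior::Error:
    if (Dst != Src)
      throw std::runtime_error("linking module flags: IDs have conflicting values");
    return Dst;
  }
  return Dst; // unreachable
}

int main() {
  // Two modules built at PIC levels 1 and 2: the linked module keeps 2.
  assert(mergeFlag(FlagBehavior::Max, /*Dst=*/1, /*Src=*/2) == 2);
  assert(mergeFlag(FlagBehavior::Max, /*Dst=*/2, /*Src=*/1) == 2);
  return 0;
}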
- if (WS.isExternal()) { - Index = NumGlobalImports + Globals.size(); + // For each global, prepare a corresponding wasm global holding its + // address. For externals these will also be named exports. + Index = NumGlobalImports + Globals.size(); - WasmGlobal Global; - Global.Type = PtrType; - Global.IsMutable = false; - Global.HasImport = false; - Global.InitialValue = DataSection.getSectionOffset(); - Global.ImportIndex = 0; - SymbolIndices[&WS] = Index; - Globals.push_back(Global); - } + WasmGlobal Global; + Global.Type = PtrType; + Global.IsMutable = false; + Global.HasImport = false; + Global.InitialValue = DataSection.getSectionOffset(); + Global.ImportIndex = 0; + SymbolIndices[&WS] = Index; + Globals.push_back(Global); } } diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 28531feccfe1..7372f24cb9a8 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -293,6 +293,10 @@ uint64_t COFFObjectFile::getSectionAddress(DataRefImpl Ref) const { return Result; } +uint64_t COFFObjectFile::getSectionIndex(DataRefImpl Sec) const { + return toSec(Sec) - SectionTable; +} + uint64_t COFFObjectFile::getSectionSize(DataRefImpl Ref) const { return getSectionSize(toSec(Ref)); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index 3d3fa07db3f4..bfb8875f47d4 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -1820,6 +1820,10 @@ uint64_t MachOObjectFile::getSectionAddress(DataRefImpl Sec) const { return getSection(Sec).addr; } +uint64_t MachOObjectFile::getSectionIndex(DataRefImpl Sec) const { + return Sec.d.a; +} + uint64_t MachOObjectFile::getSectionSize(DataRefImpl Sec) const { // In the case if a malformed Mach-O file where the section offset is past // the end of the file or some part of the section size is past the end of diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 058686e4db9e..f565d7a33e55 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -743,6 +743,10 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec, uint64_t WasmObjectFile::getSectionAddress(DataRefImpl Sec) const { return 0; } +uint64_t WasmObjectFile::getSectionIndex(DataRefImpl Sec) const { + return Sec.d.a; +} + uint64_t WasmObjectFile::getSectionSize(DataRefImpl Sec) const { const WasmSection &S = Sections[Sec.d.a]; return S.Content.size(); diff --git a/lib/Option/OptTable.cpp b/lib/Option/OptTable.cpp index 7eafb00855d7..b00d21ec8f67 100644 --- a/lib/Option/OptTable.cpp +++ b/lib/Option/OptTable.cpp @@ -186,6 +186,20 @@ static unsigned matchOption(const OptTable::Info *I, StringRef Str, return 0; } +std::vector OptTable::findByPrefix(StringRef Cur) const { + std::vector Ret; + for (const Info &In : OptionInfos.slice(FirstSearchableIndex)) { + if (!In.Prefixes) + continue; + for (int I = 0; In.Prefixes[I]; I++) { + std::string S = std::string(In.Prefixes[I]) + std::string(In.Name); + if (StringRef(S).startswith(Cur)) + Ret.push_back(S); + } + } + return Ret; +} + Arg *OptTable::ParseOneArg(const ArgList &Args, unsigned &Index, unsigned FlagsToInclude, unsigned FlagsToExclude) const { diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp index 6ece7965ce64..abc53e97aa72 100644 --- a/lib/Passes/PassBuilder.cpp +++ b/lib/Passes/PassBuilder.cpp @@ -155,6 +155,11 @@ static cl::opt cl::Hidden, cl::ZeroOrMore, cl::desc("Run Partial inlinining pass")); +static cl::opt + RunNewGVN("enable-npm-newgvn", cl::init(false), + 
cl::Hidden, cl::ZeroOrMore, + cl::desc("Run NewGVN instead of GVN")); + static cl::opt EnableGVNHoist( "enable-npm-gvn-hoist", cl::init(false), cl::Hidden, cl::desc("Enable the GVN hoisting pass for the new PM (default = off)")); @@ -336,10 +341,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, // Rotate Loop - disable header duplication at -Oz LPM1.addPass(LoopRotatePass(Level != Oz)); LPM1.addPass(LICMPass()); -#if 0 - // The LoopUnswitch pass isn't yet ported to the new pass manager. - LPM1.addPass(LoopUnswitchPass(/* OptimizeForSize */ Level != O3)); -#endif + LPM1.addPass(SimpleLoopUnswitchPass()); LPM2.addPass(IndVarSimplifyPass()); LPM2.addPass(LoopIdiomRecognizePass()); LPM2.addPass(LoopDeletionPass()); @@ -357,7 +359,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level, if (Level != O1) { // These passes add substantial compile time so skip them at O1. FPM.addPass(MergedLoadStoreMotionPass()); - FPM.addPass(GVN()); + if (RunNewGVN) + FPM.addPass(NewGVNPass()); + else + FPM.addPass(GVN()); } // Specially optimize memory movement as it doesn't look like dataflow in SSA. @@ -429,6 +434,11 @@ static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging, MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline))); } + // Delete anything that is now dead to make sure that we don't instrument + // dead code. Instrumentation can end up keeping dead code around and + // dramatically increase code size. + MPM.addPass(GlobalDCEPass()); + if (RunProfileGen) { MPM.addPass(PGOInstrumentationGen()); @@ -774,7 +784,10 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, // FIXME: once we fix LoopPass Manager, add LICM here. // FIXME: once we provide support for enabling MLSM, add it here. // FIXME: once we provide support for enabling NewGVN, add it here. - MainFPM.addPass(GVN()); + if (RunNewGVN) + MainFPM.addPass(NewGVNPass()); + else + MainFPM.addPass(GVN()); // Remove dead memcpy()'s. MainFPM.addPass(MemCpyOptPass()); diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp index 64a65ccc11a1..a2b7c94f9dec 100644 --- a/lib/ProfileData/InstrProf.cpp +++ b/lib/ProfileData/InstrProf.cpp @@ -355,7 +355,7 @@ void InstrProfSymtab::create(Module &M, bool InLTO) { finalizeSymtab(); } -Error collectPGOFuncNameStrings(const std::vector &NameStrs, +Error collectPGOFuncNameStrings(ArrayRef NameStrs, bool doCompression, std::string &Result) { assert(!NameStrs.empty() && "No name data to emit"); @@ -403,7 +403,7 @@ StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar) { return NameStr; } -Error collectPGOFuncNameStrings(const std::vector &NameVars, +Error collectPGOFuncNameStrings(ArrayRef NameVars, std::string &Result, bool doCompression) { std::vector NameStrs; for (auto *NameVar : NameVars) { @@ -978,22 +978,22 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) { } // Parse the value profile options. 
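The value-profile option handled just below has the form "<start>:<last>" (or a bare "<last>"), with 0 and 8 as the defaults; the rewrite only moves the logic onto StringRef. A standalone sketch of the same parsing rules on plain std::string (hypothetical helper name):

#include <cassert>
#include <cstdint>
#include <string>

// Parse a "<start>:<last>" memop size range, defaulting to [0, 8]; a bare
// "<last>" only overrides the upper bound. Mirrors the rules used by
// getMemOPSizeRangeFromOption in the hunk below.
static void parseSizeRange(const std::string &Opt, int64_t &Start, int64_t &Last) {
  Start = 0;
  Last = 8;
  if (Opt.empty())
    return;
  std::size_t Pos = Opt.find(':');
  if (Pos != std::string::npos) {
    if (Pos > 0)
      Start = std::stoll(Opt.substr(0, Pos));
    if (Pos < Opt.size() - 1)
      Last = std::stoll(Opt.substr(Pos + 1));
  } else {
    Last = std::stoll(Opt);
  }
  assert(Last >= Start);
}

int main() {
  int64_t S, L;
  parseSizeRange("", S, L);      assert(S == 0 && L == 8);
  parseSizeRange("2:16", S, L);  assert(S == 2 && L == 16);
  parseSizeRange("32", S, L);    assert(S == 0 && L == 32);
  parseSizeRange(":4", S, L);    assert(S == 0 && L == 4);
  return 0;
}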
-void getMemOPSizeRangeFromOption(std::string MemOPSizeRange, - int64_t &RangeStart, int64_t &RangeLast) { +void getMemOPSizeRangeFromOption(StringRef MemOPSizeRange, int64_t &RangeStart, + int64_t &RangeLast) { static const int64_t DefaultMemOPSizeRangeStart = 0; static const int64_t DefaultMemOPSizeRangeLast = 8; RangeStart = DefaultMemOPSizeRangeStart; RangeLast = DefaultMemOPSizeRangeLast; if (!MemOPSizeRange.empty()) { - auto Pos = MemOPSizeRange.find(":"); + auto Pos = MemOPSizeRange.find(':'); if (Pos != std::string::npos) { if (Pos > 0) - RangeStart = atoi(MemOPSizeRange.substr(0, Pos).c_str()); + MemOPSizeRange.substr(0, Pos).getAsInteger(10, RangeStart); if (Pos < MemOPSizeRange.size() - 1) - RangeLast = atoi(MemOPSizeRange.substr(Pos + 1).c_str()); + MemOPSizeRange.substr(Pos + 1).getAsInteger(10, RangeLast); } else - RangeLast = atoi(MemOPSizeRange.c_str()); + MemOPSizeRange.getAsInteger(10, RangeLast); } assert(RangeLast >= RangeStart); } diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 2a916b14bc22..e9716e3b1e87 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -2045,7 +2045,7 @@ void APInt::toString(SmallVectorImpl &Str, unsigned Radix, if (isSingleWord()) { char Buffer[65]; - char *BufPtr = Buffer+65; + char *BufPtr = std::end(Buffer); uint64_t N; if (!Signed) { @@ -2069,7 +2069,7 @@ void APInt::toString(SmallVectorImpl &Str, unsigned Radix, *--BufPtr = Digits[N % Radix]; N /= Radix; } - Str.append(BufPtr, Buffer+65); + Str.append(BufPtr, std::end(Buffer)); return; } diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp index 5c277448a765..862232971162 100644 --- a/lib/Support/BinaryStreamReader.cpp +++ b/lib/Support/BinaryStreamReader.cpp @@ -42,29 +42,30 @@ Error BinaryStreamReader::readBytes(ArrayRef &Buffer, uint32_t Size) { } Error BinaryStreamReader::readCString(StringRef &Dest) { - // TODO: This could be made more efficient by using readLongestContiguousChunk - // and searching for null terminators in the resulting buffer. - - uint32_t Length = 0; - // First compute the length of the string by reading 1 byte at a time. uint32_t OriginalOffset = getOffset(); - const char *C; + uint32_t FoundOffset = 0; while (true) { - if (auto EC = readObject(C)) + uint32_t ThisOffset = getOffset(); + ArrayRef Buffer; + if (auto EC = readLongestContiguousChunk(Buffer)) return EC; - if (*C == '\0') + StringRef S(reinterpret_cast(Buffer.begin()), Buffer.size()); + size_t Pos = S.find_first_of('\0'); + if (LLVM_LIKELY(Pos != StringRef::npos)) { + FoundOffset = Pos + ThisOffset; break; - ++Length; + } } - // Now go back and request a reference for that many bytes. - uint32_t NewOffset = getOffset(); + assert(FoundOffset >= OriginalOffset); + setOffset(OriginalOffset); + size_t Length = FoundOffset - OriginalOffset; if (auto EC = readFixedString(Dest, Length)) return EC; - // Now set the offset back to where it was after we calculated the length. - setOffset(NewOffset); + // Now set the offset back to after the null terminator. + setOffset(FoundOffset + 1); return Error::success(); } diff --git a/lib/Support/ConvertUTF.cpp b/lib/Support/ConvertUTF.cpp index 39fd218d3f07..aa9507c189ed 100644 --- a/lib/Support/ConvertUTF.cpp +++ b/lib/Support/ConvertUTF.cpp @@ -53,6 +53,35 @@ #endif #include + +/* + * This code extensively uses fall-through switches. + * Keep the compiler from warning about that. 
+ */ +#if defined(__clang__) && defined(__has_warning) +# if __has_warning("-Wimplicit-fallthrough") +# define ConvertUTF_DISABLE_WARNINGS \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"") +# define ConvertUTF_RESTORE_WARNINGS \ + _Pragma("clang diagnostic pop") +# endif +#elif defined(__GNUC__) && __GNUC__ > 6 +# define ConvertUTF_DISABLE_WARNINGS \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"") +# define ConvertUTF_RESTORE_WARNINGS \ + _Pragma("GCC diagnostic pop") +#endif +#ifndef ConvertUTF_DISABLE_WARNINGS +# define ConvertUTF_DISABLE_WARNINGS +#endif +#ifndef ConvertUTF_RESTORE_WARNINGS +# define ConvertUTF_RESTORE_WARNINGS +#endif + +ConvertUTF_DISABLE_WARNINGS + namespace llvm { static const int halfShift = 10; /* used for shifting by 10 bits */ @@ -708,3 +737,5 @@ ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart, --------------------------------------------------------------------- */ } // namespace llvm + +ConvertUTF_RESTORE_WARNINGS diff --git a/lib/Support/DebugCounter.cpp b/lib/Support/DebugCounter.cpp index 29dae8a20f00..a10ac8e85396 100644 --- a/lib/Support/DebugCounter.cpp +++ b/lib/Support/DebugCounter.cpp @@ -6,6 +6,7 @@ using namespace llvm; +namespace { // This class overrides the default list implementation of printing so we // can pretty print the list of debug counter options. This type of // dynamic option is pretty rare (basically this and pass lists). @@ -40,6 +41,7 @@ private: } } }; +} // namespace // Create our command line option. static DebugCounterList DebugCounterOption( diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index 1541a5726302..9398789cea87 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -127,10 +127,15 @@ void DynamicLibrary::AddSymbol(StringRef SymbolName, void *SymbolValue) { DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *FileName, std::string *Err) { - SmartScopedLock Lock(*SymbolsMutex); + // Force OpenedHandles to be added into the ManagedStatic list before any + // ManagedStatic can be added from static constructors in HandleSet::DLOpen. 
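Two ways of dealing with -Wimplicit-fallthrough appear in this stretch of the patch: ConvertUTF.cpp wraps its legacy fall-through switches in push/ignore/pop pragmas, while the GraphWriter, Path, and YAMLParser hunks annotate each intentional fall-through with LLVM_FALLTHROUGH (which expands to [[fallthrough]] or a compiler-specific equivalent where available). A minimal sketch of the annotation style using the standard attribute:

#include <cassert>

// Classify a character, deliberately letting digits fall through into the
// alphanumeric case. The explicit annotation tells the compiler (and the
// reader) that the missing break is intentional, so -Wimplicit-fallthrough
// stays quiet.
static int classify(char C) {
  int Score = 0;
  switch (C) {
  case '0':
  case '1':
    Score += 10;     // digits get an extra bump...
    [[fallthrough]]; // ...and then share the alphanumeric handling
  case 'a':
  case 'b':
    Score += 1;
    break;
  default:
    break;
  }
  return Score;
}

int main() {
  assert(classify('1') == 11);
  assert(classify('a') == 1);
  assert(classify('?') == 0);
  return 0;
}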
+ HandleSet& HS = *OpenedHandles; + void *Handle = HandleSet::DLOpen(FileName, Err); - if (Handle != &Invalid) - OpenedHandles->AddLibrary(Handle, /*IsProcess*/ FileName == nullptr); + if (Handle != &Invalid) { + SmartScopedLock Lock(*SymbolsMutex); + HS.AddLibrary(Handle, /*IsProcess*/ FileName == nullptr); + } return DynamicLibrary(Handle); } diff --git a/lib/Support/GraphWriter.cpp b/lib/Support/GraphWriter.cpp index d0e1d50e8ccb..f70b77da8de4 100644 --- a/lib/Support/GraphWriter.cpp +++ b/lib/Support/GraphWriter.cpp @@ -43,6 +43,7 @@ std::string llvm::DOT::EscapeString(const std::string &Label) { Str.erase(Str.begin()+i); continue; default: break; } + LLVM_FALLTHROUGH; case '{': case '}': case '<': case '>': case '|': case '"': diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 6a0b64fb884d..234f7439a546 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -1401,6 +1401,7 @@ bool sys::getHostCPUFeatures(StringMap &Features) { Features["prefetchwt1"] = HasLeaf7 && (ECX & 1); Features["avx512vbmi"] = HasLeaf7 && ((ECX >> 1) & 1) && HasAVX512Save; + Features["avx512vpopcntdq"] = HasLeaf7 && ((ECX >> 14) & 1) && HasAVX512Save; // Enable protection keys Features["pku"] = HasLeaf7 && ((ECX >> 4) & 1); diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp index 9fd6652ce4b8..80bef558258d 100644 --- a/lib/Support/Path.cpp +++ b/lib/Support/Path.cpp @@ -1156,6 +1156,7 @@ file_magic identify_magic(StringRef Magic) { case 0xc4: // ARMNT Windows if (Magic[1] == 0x01) return file_magic::coff_object; + LLVM_FALLTHROUGH; case 0x90: // PA-RISC Windows case 0x68: // mc68K Windows diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp index b0e3d6898cae..318e21da999d 100644 --- a/lib/Support/Triple.cpp +++ b/lib/Support/Triple.cpp @@ -34,6 +34,7 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case mips64: return "mips64"; case mips64el: return "mips64el"; case msp430: return "msp430"; + case nios2: return "nios2"; case ppc64: return "powerpc64"; case ppc64le: return "powerpc64le"; case ppc: return "powerpc"; @@ -98,6 +99,8 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case mips64: case mips64el: return "mips"; + case nios2: return "nios2"; + case hexagon: return "hexagon"; case amdgcn: return "amdgcn"; @@ -262,6 +265,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("mips64", mips64) .Case("mips64el", mips64el) .Case("msp430", msp430) + .Case("nios2", nios2) .Case("ppc64", ppc64) .Case("ppc32", ppc) .Case("ppc", ppc) @@ -384,6 +388,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Cases("mipsel", "mipsallegrexel", Triple::mipsel) .Cases("mips64", "mips64eb", Triple::mips64) .Case("mips64el", Triple::mips64el) + .Case("nios2", Triple::nios2) .Case("r600", Triple::r600) .Case("amdgcn", Triple::amdgcn) .Case("riscv32", Triple::riscv32) @@ -625,6 +630,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::mips64el: case Triple::mipsel: case Triple::msp430: + case Triple::nios2: case Triple::nvptx: case Triple::nvptx64: case Triple::ppc64le: @@ -643,11 +649,13 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::tce: case Triple::tcele: case Triple::thumbeb: - case Triple::wasm32: - case Triple::wasm64: case Triple::xcore: return Triple::ELF; + case Triple::wasm32: + case Triple::wasm64: + return Triple::Wasm; + case Triple::ppc: case Triple::ppc64: if (T.isOSDarwin()) @@ -1160,6 +1168,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { 
case llvm::Triple::le32: case llvm::Triple::mips: case llvm::Triple::mipsel: + case llvm::Triple::nios2: case llvm::Triple::nvptx: case llvm::Triple::ppc: case llvm::Triple::r600: @@ -1243,6 +1252,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::le32: case Triple::mips: case Triple::mipsel: + case Triple::nios2: case Triple::nvptx: case Triple::ppc: case Triple::r600: @@ -1290,6 +1300,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::kalimba: case Triple::lanai: case Triple::msp430: + case Triple::nios2: case Triple::r600: case Triple::tce: case Triple::tcele: @@ -1361,6 +1372,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::le32: case Triple::le64: case Triple::msp430: + case Triple::nios2: case Triple::nvptx64: case Triple::nvptx: case Triple::r600: @@ -1447,6 +1459,7 @@ bool Triple::isLittleEndian() const { case Triple::mips64el: case Triple::mipsel: case Triple::msp430: + case Triple::nios2: case Triple::nvptx64: case Triple::nvptx: case Triple::ppc64le: diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp index c17a6f6e1ea6..f1496393e55e 100644 --- a/lib/Support/YAMLParser.cpp +++ b/lib/Support/YAMLParser.cpp @@ -2116,6 +2116,7 @@ void MappingNode::increment() { break; default: setError("Unexpected token. Expected Key or Block End", T); + LLVM_FALLTHROUGH; case Token::TK_Error: IsAtEnd = true; CurrentEntry = nullptr; @@ -2128,6 +2129,7 @@ void MappingNode::increment() { return increment(); case Token::TK_FlowMappingEnd: getNext(); + LLVM_FALLTHROUGH; case Token::TK_Error: // Set this to end iterator. IsAtEnd = true; @@ -2170,6 +2172,7 @@ void SequenceNode::increment() { default: setError( "Unexpected token. Expected Block Entry or Block End." , T); + LLVM_FALLTHROUGH; case Token::TK_Error: IsAtEnd = true; CurrentEntry = nullptr; @@ -2198,6 +2201,7 @@ void SequenceNode::increment() { return increment(); case Token::TK_FlowSequenceEnd: getNext(); + LLVM_FALLTHROUGH; case Token::TK_Error: // Set this to end iterator. 
IsAtEnd = true; diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp index 33d3de5daf33..09f9759ce7da 100644 --- a/lib/TableGen/Record.cpp +++ b/lib/TableGen/Record.cpp @@ -219,7 +219,6 @@ ProfileBitsInit(FoldingSetNodeID &ID, ArrayRef Range) { BitsInit *BitsInit::get(ArrayRef Range) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileBitsInit(ID, Range); @@ -234,7 +233,6 @@ BitsInit *BitsInit::get(ArrayRef Range) { std::uninitialized_copy(Range.begin(), Range.end(), I->getTrailingObjects()); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -456,7 +454,6 @@ static void ProfileListInit(FoldingSetNodeID &ID, ListInit *ListInit::get(ArrayRef Range, RecTy *EltTy) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileListInit(ID, Range, EltTy); @@ -471,7 +468,6 @@ ListInit *ListInit::get(ArrayRef Range, RecTy *EltTy) { std::uninitialized_copy(Range.begin(), Range.end(), I->getTrailingObjects()); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -606,7 +602,6 @@ ProfileUnOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *Op, RecTy *Type) { UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileUnOpInit(ID, Opc, LHS, Type); @@ -617,7 +612,6 @@ UnOpInit *UnOpInit::get(UnaryOp Opc, Init *LHS, RecTy *Type) { UnOpInit *I = new(Allocator) UnOpInit(Opc, LHS, Type); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -752,7 +746,6 @@ ProfileBinOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *RHS, BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, Init *RHS, RecTy *Type) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileBinOpInit(ID, Opc, LHS, RHS, Type); @@ -763,7 +756,6 @@ BinOpInit *BinOpInit::get(BinaryOp Opc, Init *LHS, BinOpInit *I = new(Allocator) BinOpInit(Opc, LHS, RHS, Type); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -910,7 +902,6 @@ ProfileTernOpInit(FoldingSetNodeID &ID, unsigned Opcode, Init *LHS, Init *MHS, TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS, RecTy *Type) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileTernOpInit(ID, Opc, LHS, MHS, RHS, Type); @@ -921,7 +912,6 @@ TernOpInit *TernOpInit::get(TernaryOp Opc, Init *LHS, Init *MHS, Init *RHS, TernOpInit *I = new(Allocator) TernOpInit(Opc, LHS, MHS, RHS, Type); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -1503,7 +1493,6 @@ DagInit * DagInit::get(Init *V, StringInit *VN, ArrayRef ArgRange, ArrayRef NameRange) { static FoldingSet ThePool; - static std::vector TheActualPool; FoldingSetNodeID ID; ProfileDagInit(ID, V, VN, ArgRange, NameRange); @@ -1512,9 +1501,13 @@ DagInit::get(Init *V, StringInit *VN, ArrayRef ArgRange, if (DagInit *I = ThePool.FindNodeOrInsertPos(ID, IP)) return I; - DagInit *I = new(Allocator) DagInit(V, VN, ArgRange, NameRange); + void *Mem = Allocator.Allocate(totalSizeToAlloc(ArgRange.size(), NameRange.size()), alignof(BitsInit)); + DagInit *I = new(Mem) DagInit(V, VN, ArgRange.size(), NameRange.size()); + std::uninitialized_copy(ArgRange.begin(), ArgRange.end(), + I->getTrailingObjects()); + std::uninitialized_copy(NameRange.begin(), NameRange.end(), + I->getTrailingObjects()); ThePool.InsertNode(I, IP); - TheActualPool.push_back(I); return I; } @@ -1533,7 +1526,7 @@ 
DagInit::get(Init *V, StringInit *VN, } void DagInit::Profile(FoldingSetNodeID &ID) const { - ProfileDagInit(ID, Val, ValName, Args, ArgNames); + ProfileDagInit(ID, Val, ValName, makeArrayRef(getTrailingObjects(), NumArgs), makeArrayRef(getTrailingObjects(), NumArgNames)); } Init *DagInit::convertInitializerTo(RecTy *Ty) const { @@ -1545,9 +1538,9 @@ Init *DagInit::convertInitializerTo(RecTy *Ty) const { Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const { SmallVector NewArgs; - NewArgs.reserve(Args.size()); + NewArgs.reserve(arg_size()); bool ArgsChanged = false; - for (const Init *Arg : Args) { + for (const Init *Arg : args()) { Init *NewArg = Arg->resolveReferences(R, RV); NewArgs.push_back(NewArg); ArgsChanged |= NewArg != Arg; @@ -1555,7 +1548,7 @@ Init *DagInit::resolveReferences(Record &R, const RecordVal *RV) const { Init *Op = Val->resolveReferences(R, RV); if (Op != Val || ArgsChanged) - return DagInit::get(Op, ValName, NewArgs, ArgNames); + return DagInit::get(Op, ValName, NewArgs, getArgNames()); return const_cast(this); } @@ -1564,12 +1557,12 @@ std::string DagInit::getAsString() const { std::string Result = "(" + Val->getAsString(); if (ValName) Result += ":" + ValName->getAsUnquotedString(); - if (!Args.empty()) { - Result += " " + Args[0]->getAsString(); - if (ArgNames[0]) Result += ":$" + ArgNames[0]->getAsUnquotedString(); - for (unsigned i = 1, e = Args.size(); i != e; ++i) { - Result += ", " + Args[i]->getAsString(); - if (ArgNames[i]) Result += ":$" + ArgNames[i]->getAsUnquotedString(); + if (!arg_empty()) { + Result += " " + getArg(0)->getAsString(); + if (getArgName(0)) Result += ":$" + getArgName(0)->getAsUnquotedString(); + for (unsigned i = 1, e = getNumArgs(); i != e; ++i) { + Result += ", " + getArg(i)->getAsString(); + if (getArgName(i)) Result += ":$" + getArgName(i)->getAsUnquotedString(); } } return Result + ")"; diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp index 056ffd58b521..981fd22c213c 100644 --- a/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -320,6 +320,9 @@ bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum, switch (ExtraCode[0]) { default: return true; // Unknown modifier. + case 'a': // Print 'a' modifier + PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O); + return false; case 'w': // Print W register case 'x': // Print X register if (MO.isReg()) @@ -388,7 +391,7 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned AsmVariant, const char *ExtraCode, raw_ostream &O) { - if (ExtraCode && ExtraCode[0]) + if (ExtraCode && ExtraCode[0] && ExtraCode[0] != 'a') return true; // Unknown modifier. 
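DagInit now stores its argument and name arrays inline after the object via TrailingObjects, so a single allocation covers the node plus both arrays and the old TheActualPool side vectors can go away. A simplified standalone model of the idiom; the real llvm::TrailingObjects additionally handles alignment and multiple trailing types:

#include <cassert>
#include <new>

// A node whose payload array lives immediately after the object itself.
struct InlineArrayNode {
  unsigned NumElems;

  explicit InlineArrayNode(unsigned N) : NumElems(N) {}

  int *elems() { return reinterpret_cast<int *>(this + 1); }

  // Allocate the header and the trailing array in one block, then
  // placement-new the header; mirrors totalSizeToAlloc + uninitialized_copy.
  static InlineArrayNode *create(const int *Data, unsigned N) {
    void *Mem = ::operator new(sizeof(InlineArrayNode) + N * sizeof(int));
    auto *Node = new (Mem) InlineArrayNode(N);
    for (unsigned I = 0; I != N; ++I)
      Node->elems()[I] = Data[I];
    return Node;
  }

  static void destroy(InlineArrayNode *Node) {
    Node->~InlineArrayNode();
    ::operator delete(Node);
  }
};

int main() {
  const int Vals[] = {3, 1, 4};
  InlineArrayNode *N = InlineArrayNode::create(Vals, 3);
  assert(N->NumElems == 3 && N->elems()[2] == 4);
  InlineArrayNode::destroy(N);
  return 0;
}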
const MachineOperand &MO = MI->getOperand(OpNum); diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 629ad5c61b78..33fec74998d6 100644 --- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -584,27 +584,21 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, return true; } -static void addPostLoopLiveIns(MachineBasicBlock *MBB, LivePhysRegs &LiveRegs) { - for (auto I = LiveRegs.begin(); I != LiveRegs.end(); ++I) - MBB->addLiveIn(*I); -} - bool AArch64ExpandPseudo::expandCMP_SWAP( MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned LdarOp, unsigned StlrOp, unsigned CmpOp, unsigned ExtendImm, unsigned ZeroReg, MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Dest = MI.getOperand(0); unsigned StatusReg = MI.getOperand(1).getReg(); - MachineOperand &Addr = MI.getOperand(2); - MachineOperand &Desired = MI.getOperand(3); - MachineOperand &New = MI.getOperand(4); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(1).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(2).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(2).getReg(); + unsigned DesiredReg = MI.getOperand(3).getReg(); + unsigned NewReg = MI.getOperand(4).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -616,19 +610,18 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( MF->insert(++StoreBB->getIterator(), DoneBB); // .Lloadcmp: + // mov wStatus, 0 // ldaxr xDest, [xAddr] // cmp xDest, xDesired // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(Dest.getReg()); - LoadCmpBB->addLiveIn(Desired.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - + if (!StatusDead) + BuildMI(LoadCmpBB, DL, TII->get(AArch64::MOVZWi), StatusReg) + .addImm(0).addImm(0); BuildMI(LoadCmpBB, DL, TII->get(LdarOp), Dest.getReg()) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(CmpOp), ZeroReg) .addReg(Dest.getReg(), getKillRegState(Dest.isDead())) - .add(Desired) + .addReg(DesiredReg) .addImm(ExtendImm); BuildMI(LoadCmpBB, DL, TII->get(AArch64::Bcc)) .addImm(AArch64CC::NE) @@ -640,25 +633,35 @@ bool AArch64ExpandPseudo::expandCMP_SWAP( // .Lstore: // stlxr wStatus, xNew, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(New.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); - - BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg).add(New).add(Addr); + BuildMI(StoreBB, DL, TII->get(StlrOp), StatusReg) + .addReg(NewReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute livein lists. 
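After rebuilding the compare-and-swap loop, the expansion below recomputes the new blocks' live-in sets from scratch and then runs the two loop blocks a second time so that values live around the back edge are picked up. A tiny standalone model of why that extra pass matters (generic dataflow on a toy CFG, not LLVM's computeLiveIns):

#include <cassert>
#include <set>
#include <string>
#include <vector>

using RegSet = std::set<std::string>;

struct ToyBlock {
  RegSet Uses, Defs;
  std::vector<const ToyBlock *> Succs;
  RegSet LiveIns;
};

// One liveness update for a single block:
//   LiveIn(B) = Uses(B) plus (union of successors' LiveIns minus Defs(B)).
static void computeLiveInsOnce(ToyBlock &B) {
  RegSet LiveOut;
  for (const ToyBlock *S : B.Succs)
    LiveOut.insert(S->LiveIns.begin(), S->LiveIns.end());
  RegSet Result = B.Uses;
  for (const std::string &R : LiveOut)
    if (!B.Defs.count(R))
      Result.insert(R);
  B.LiveIns = Result;
}

int main() {
  // CAS expansion shape: LoadCmp -> {Store, Done}, Store -> {LoadCmp, Done}.
  ToyBlock Done, Store, LoadCmp;
  Done.Uses = {"dest"};
  Store.Uses = {"addr", "new"};       Store.Defs = {"status"};
  LoadCmp.Uses = {"addr", "desired"}; LoadCmp.Defs = {"dest"};
  Store.Succs = {&LoadCmp, &Done};
  LoadCmp.Succs = {&Store, &Done};

  // Bottom-up pass: Done, Store, LoadCmp.
  computeLiveInsOnce(Done);
  computeLiveInsOnce(Store);
  computeLiveInsOnce(LoadCmp);
  // "desired" is live around the back edge, but Store was processed before
  // LoadCmp had any live-ins, so it is missing until the extra pass.
  assert(!Store.LiveIns.count("desired"));
  computeLiveInsOnce(Store);
  assert(Store.LiveIns.count("desired"));
  return 0;
}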
+ const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass around the loop to get loop carried registers right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } @@ -671,16 +674,15 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( MachineOperand &DestLo = MI.getOperand(0); MachineOperand &DestHi = MI.getOperand(1); unsigned StatusReg = MI.getOperand(2).getReg(); - MachineOperand &Addr = MI.getOperand(3); - MachineOperand &DesiredLo = MI.getOperand(4); - MachineOperand &DesiredHi = MI.getOperand(5); - MachineOperand &NewLo = MI.getOperand(6); - MachineOperand &NewHi = MI.getOperand(7); - - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); - LiveRegs.addLiveOuts(MBB); - for (auto I = std::prev(MBB.end()); I != MBBI; --I) - LiveRegs.stepBackward(*I); + bool StatusDead = MI.getOperand(2).isDead(); + // Duplicating undef operands into 2 instructions does not guarantee the same + // value on both; However undef should be replaced by xzr anyway. + assert(!MI.getOperand(3).isUndef() && "cannot handle undef"); + unsigned AddrReg = MI.getOperand(3).getReg(); + unsigned DesiredLoReg = MI.getOperand(4).getReg(); + unsigned DesiredHiReg = MI.getOperand(5).getReg(); + unsigned NewLoReg = MI.getOperand(6).getReg(); + unsigned NewHiReg = MI.getOperand(7).getReg(); MachineFunction *MF = MBB.getParent(); auto LoadCmpBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); @@ -696,20 +698,13 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // cmp xDestLo, xDesiredLo // sbcs xDestHi, xDesiredHi // b.ne .Ldone - LoadCmpBB->addLiveIn(Addr.getReg()); - LoadCmpBB->addLiveIn(DestLo.getReg()); - LoadCmpBB->addLiveIn(DestHi.getReg()); - LoadCmpBB->addLiveIn(DesiredLo.getReg()); - LoadCmpBB->addLiveIn(DesiredHi.getReg()); - addPostLoopLiveIns(LoadCmpBB, LiveRegs); - BuildMI(LoadCmpBB, DL, TII->get(AArch64::LDAXPX)) .addReg(DestLo.getReg(), RegState::Define) .addReg(DestHi.getReg(), RegState::Define) - .addReg(Addr.getReg()); + .addReg(AddrReg); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestLo.getReg(), getKillRegState(DestLo.isDead())) - .add(DesiredLo) + .addReg(DesiredLoReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(AArch64::WZR) @@ -717,14 +712,14 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::SUBSXrs), AArch64::XZR) .addReg(DestHi.getReg(), getKillRegState(DestHi.isDead())) - .add(DesiredHi) + .addReg(DesiredHiReg) .addImm(0); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CSINCWr), StatusReg) .addUse(StatusReg, RegState::Kill) .addUse(StatusReg, RegState::Kill) .addImm(AArch64CC::EQ); BuildMI(LoadCmpBB, DL, TII->get(AArch64::CBNZW)) - .addUse(StatusReg, RegState::Kill) + .addUse(StatusReg, getKillRegState(StatusDead)) .addMBB(DoneBB); LoadCmpBB->addSuccessor(DoneBB); LoadCmpBB->addSuccessor(StoreBB); @@ -732,28 +727,36 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128( // .Lstore: // stlxp wStatus, xNewLo, xNewHi, [xAddr] // cbnz wStatus, .Lloadcmp - StoreBB->addLiveIn(Addr.getReg()); - StoreBB->addLiveIn(NewLo.getReg()); - StoreBB->addLiveIn(NewHi.getReg()); - addPostLoopLiveIns(StoreBB, LiveRegs); BuildMI(StoreBB, DL, TII->get(AArch64::STLXPX), StatusReg) - .add(NewLo) - .add(NewHi) - .add(Addr); + 
.addReg(NewLoReg) + .addReg(NewHiReg) + .addReg(AddrReg); BuildMI(StoreBB, DL, TII->get(AArch64::CBNZW)) - .addReg(StatusReg, RegState::Kill) + .addReg(StatusReg, getKillRegState(StatusDead)) .addMBB(LoadCmpBB); StoreBB->addSuccessor(LoadCmpBB); StoreBB->addSuccessor(DoneBB); DoneBB->splice(DoneBB->end(), &MBB, MI, MBB.end()); DoneBB->transferSuccessors(&MBB); - addPostLoopLiveIns(DoneBB, LiveRegs); MBB.addSuccessor(LoadCmpBB); NextMBBI = MBB.end(); MI.eraseFromParent(); + + // Recompute liveness bottom up. + const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + LivePhysRegs LiveRegs; + computeLiveIns(LiveRegs, MRI, *DoneBB); + computeLiveIns(LiveRegs, MRI, *StoreBB); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + // Do an extra pass in the loop to get the loop carried dependencies right. + StoreBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *StoreBB); + LoadCmpBB->clearLiveIns(); + computeLiveIns(LiveRegs, MRI, *LoadCmpBB); + return true; } diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 1aec602a2a36..0b92249580c8 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -267,12 +267,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { return AArch64::X9; const AArch64Subtarget &Subtarget = MF->getSubtarget(); - const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo(); + const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); LivePhysRegs LiveRegs(TRI); LiveRegs.addLiveIns(*MBB); // Mark callee saved registers as used so we will not choose them. - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF); + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(MF); for (unsigned i = 0; CSRegs[i]; ++i) LiveRegs.addReg(CSRegs[i]); @@ -991,6 +991,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( SmallVector RegPairs; computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs); + const MachineRegisterInfo &MRI = MF.getRegInfo(); for (auto RPII = RegPairs.rbegin(), RPIE = RegPairs.rend(); RPII != RPIE; ++RPII) { @@ -1022,9 +1023,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( dbgs() << ")\n"); MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc)); - MBB.addLiveIn(Reg1); + if (!MRI.isReserved(Reg1)) + MBB.addLiveIn(Reg1); if (RPI.isPaired()) { - MBB.addLiveIn(Reg2); + if (!MRI.isReserved(Reg2)) + MBB.addLiveIn(Reg2); MIB.addReg(Reg2, getPrologueDeath(MF, Reg2)); MIB.addMemOperand(MF.getMachineMemOperand( MachinePointerInfo::getFixedStack(MF, RPI.FrameIdx + 1), diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 1af36086ad90..62f4c953830b 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -886,18 +886,21 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, // Create the new constant immediate node. EVT VT = Op.getValueType(); SDLoc DL(Op); + SDValue New; // If the new constant immediate is all-zeros or all-ones, let the target // independent DAG combine optimize this node. - if (NewImm == 0 || NewImm == OrigMask) - return TLO.CombineTo(Op.getOperand(1), TLO.DAG.getConstant(NewImm, DL, VT)); - + if (NewImm == 0 || NewImm == OrigMask) { + New = TLO.DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0), + TLO.DAG.getConstant(NewImm, DL, VT)); // Otherwise, create a machine node so that target independent DAG combine // doesn't undo this optimization. 
- Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); - SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); - SDValue New( - TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } else { + Enc = AArch64_AM::encodeLogicalImmediate(NewImm, Size); + SDValue EncConst = TLO.DAG.getTargetConstant(Enc, DL, VT); + New = SDValue( + TLO.DAG.getMachineNode(NewOpc, DL, VT, Op.getOperand(0), EncConst), 0); + } return TLO.CombineTo(Op, New); } @@ -9219,16 +9222,26 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, // instructions (stp). SDLoc DL(&St); SDValue BasePtr = St.getBasePtr(); + uint64_t BaseOffset = 0; + const MachinePointerInfo &PtrInfo = St.getPointerInfo(); SDValue NewST1 = DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo, OrigAlignment, St.getMemOperand()->getFlags()); + // As this in ISel, we will not merge this add which may degrade results. + if (BasePtr->getOpcode() == ISD::ADD && + isa(BasePtr->getOperand(1))) { + BaseOffset = cast(BasePtr->getOperand(1))->getSExtValue(); + BasePtr = BasePtr->getOperand(0); + } + unsigned Offset = EltOffset; while (--NumVecElts) { unsigned Alignment = MinAlign(OrigAlignment, Offset); - SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, - DAG.getConstant(Offset, DL, MVT::i64)); + SDValue OffsetPtr = + DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr, + DAG.getConstant(BaseOffset + Offset, DL, MVT::i64)); NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr, PtrInfo.getWithOffset(Offset), Alignment, St.getMemOperand()->getFlags()); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index c42738da7ab0..faf39be9b41e 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -763,15 +763,126 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const { llvm_unreachable("Unknown opcode to check as cheap as a move!"); } -bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const { - if (MI.getNumOperands() < 4) +bool AArch64InstrInfo::isFalkorShiftExtFast(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: return false; - unsigned ShOpVal = MI.getOperand(3).getImm(); - unsigned ShImm = AArch64_AM::getShiftValue(ShOpVal); - if (AArch64_AM::getShiftType(ShOpVal) == AArch64_AM::LSL && - ShImm < 4) - return true; - return false; + + case AArch64::ADDWrs: + case AArch64::ADDXrs: + case AArch64::ADDSWrs: + case AArch64::ADDSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + if (ShiftVal == 0) + return true; + return AArch64_AM::getShiftType(Imm) == AArch64_AM::LSL && ShiftVal <= 5; + } + + case AArch64::ADDWrx: + case AArch64::ADDXrx: + case AArch64::ADDXrx64: + case AArch64::ADDSWrx: + case AArch64::ADDSXrx: + case AArch64::ADDSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) <= 4; + } + } + + case AArch64::SUBWrs: + case AArch64::SUBSWrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 31); + } + + case AArch64::SUBXrs: + case AArch64::SUBSXrs: { + unsigned Imm = MI.getOperand(3).getImm(); + unsigned ShiftVal = 
AArch64_AM::getShiftValue(Imm); + return ShiftVal == 0 || + (AArch64_AM::getShiftType(Imm) == AArch64_AM::ASR && ShiftVal == 63); + } + + case AArch64::SUBWrx: + case AArch64::SUBXrx: + case AArch64::SUBXrx64: + case AArch64::SUBSWrx: + case AArch64::SUBSXrx: + case AArch64::SUBSXrx64: { + unsigned Imm = MI.getOperand(3).getImm(); + switch (AArch64_AM::getArithExtendType(Imm)) { + default: + return false; + case AArch64_AM::UXTB: + case AArch64_AM::UXTH: + case AArch64_AM::UXTW: + case AArch64_AM::UXTX: + return AArch64_AM::getArithShiftValue(Imm) == 0; + } + } + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::PRFMroW: + case AArch64::PRFMroX: + case AArch64::STRBBroW: + case AArch64::STRBBroX: + case AArch64::STRBroW: + case AArch64::STRBroX: + case AArch64::STRDroW: + case AArch64::STRDroX: + case AArch64::STRHHroW: + case AArch64::STRHHroX: + case AArch64::STRHroW: + case AArch64::STRHroX: + case AArch64::STRQroW: + case AArch64::STRQroX: + case AArch64::STRSroW: + case AArch64::STRSroX: + case AArch64::STRWroW: + case AArch64::STRWroX: + case AArch64::STRXroW: + case AArch64::STRXroX: { + unsigned IsSigned = MI.getOperand(3).getImm(); + return !IsSigned; + } + } } bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI, diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 4cd14db633b9..59f3405fe439 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -270,7 +270,7 @@ public: bool IsTailCall) const override; /// Returns true if the instruction has a shift by immediate that can be /// executed in one cycle less. - bool isFalkorLSLFast(const MachineInstr &MI) const; + bool isFalkorShiftExtFast(const MachineInstr &MI) const; private: /// \brief Sets the offsets on outlined instructions in \p MBB which use SP diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index da68f3165c5e..ad24612239fa 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -442,7 +442,7 @@ def MSRpstateImm4 : MSRpstateImm0_15; // TPIDR_EL0. Add pseudo op so we can mark it as not having any side effects. let hasSideEffects = 0 in def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins), - [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>; + [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[WriteSys]>; // The cycle counter PMC register is PMCCNTR_EL0. 
let Predicates = [HasPerfMon] in diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp index a6926a6700e1..3b71d529db59 100644 --- a/lib/Target/AArch64/AArch64MacroFusion.cpp +++ b/lib/Target/AArch64/AArch64MacroFusion.cpp @@ -232,6 +232,19 @@ static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) { dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " << DAG->TII->getName(SecondMI->getOpcode()) << '\n'; ); + if (&SecondSU != &DAG->ExitSU) + // Make instructions dependent on FirstSU also dependent on SecondSU to + // prevent them from being scheduled between FirstSU and and SecondSU. + for (SUnit::const_succ_iterator + SI = FirstSU.Succs.begin(), SE = FirstSU.Succs.end(); + SI != SE; ++SI) { + if (!SI->getSUnit() || SI->getSUnit() == &SecondSU) + continue; + DEBUG(dbgs() << " Copy Succ "; + SI->getSUnit()->print(dbgs(), DAG); dbgs() << '\n';); + DAG->addEdge(SI->getSUnit(), SDep(&SecondSU, SDep::Artificial)); + } + ++NumFused; return true; } diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td index cf1c0b66db58..44fd94fc3d48 100644 --- a/lib/Target/AArch64/AArch64SchedFalkor.td +++ b/lib/Target/AArch64/AArch64SchedFalkor.td @@ -61,56 +61,42 @@ let SchedModel = FalkorModel in { let SchedModel = FalkorModel in { -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes - { let Latency = 1; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 2; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 2; let NumMicroOps = 2; } -def : WriteRes { let Latency = 1; } -def : WriteRes - { let Latency = 8; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 16; let NumMicroOps = 2; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 5; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 3; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 5; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 3; let NumMicroOps = 2; } -def : WriteRes { let Latency = 2; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes { let Latency = 4; } -def : WriteRes - { let Latency = 6; let NumMicroOps = 2; } -def : WriteRes - { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1 -def : WriteRes { let Latency = 6; } -def : WriteRes { let Latency = 3; } -def : WriteRes - { let Latency = 0; let NumMicroOps = 2; } +// These WriteRes entries are not used in the Falkor sched model. 
+def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } +def : WriteRes { let Unsupported = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } -def : WriteRes { let Latency = 1; } - -def : WriteRes { let Latency = 3; } - -def : WriteRes { let Unsupported = 1; } - -// No forwarding logic is modelled yet. +// These ReadAdvance entries are not used in the Falkor sched model. def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index a9b4d44a523e..d098cf7a5a37 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -12,7 +12,509 @@ // //===----------------------------------------------------------------------===// -include "AArch64SchedFalkorWriteRes.td" +// Contains all of the Falkor specific SchedWriteRes types. The approach +// below is to define a generic SchedWriteRes for every combination of +// latency and microOps. The naming conventions is to use a prefix, one field +// for latency, and one or more microOp count/type designators. +// Prefix: FalkorWr +// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) +// Latency: #cyc +// +// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued +// down one Z pipe, six SD pipes, four VX pipes and the total latency is +// six cycles. +// +// Contains all of the Falkor specific ReadAdvance types for forwarding logic. +// +// Contains all of the Falkor specific WriteVariant types for immediate zero +// and LSLFast. 
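The block comment above packs both the issue cost and the latency into each def name. As a quick, purely illustrative sketch (this helper is not part of the patch or of LLVM; the name is the example quoted in the comment), the convention decodes like this:

  // Decode a FalkorWr_* name per the convention described above: each
  // "<N><pipe>" field adds N micro-ops, the trailing "<N>cyc" field is the
  // latency.
  #include <cstdio>
  #include <regex>
  #include <string>

  int main() {
    const std::string Name = "FalkorWr_1Z_6SD_4VX_6cyc"; // example from the comment
    const std::regex Field("([0-9]+)(cyc|[A-Z]+)");
    unsigned MicroOps = 0, Latency = 0;
    for (std::sregex_iterator I(Name.begin(), Name.end(), Field), E; I != E; ++I) {
      unsigned N = std::stoi((*I)[1]);
      if ((*I)[2] == "cyc")
        Latency = N;   // trailing latency field
      else
        MicroOps += N; // pipe designator field contributes micro-ops
    }
    // Prints: FalkorWr_1Z_6SD_4VX_6cyc -> 11 micro-ops, latency 6
    std::printf("%s -> %u micro-ops, latency %u\n", Name.c_str(), MicroOps, Latency);
  }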
+//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Define 0 micro-op types +def FalkorWr_none_1cyc : SchedWriteRes<[]> { + let Latency = 1; + let NumMicroOps = 0; +} +def FalkorWr_none_3cyc : SchedWriteRes<[]> { + let Latency = 3; + let NumMicroOps = 0; +} +def FalkorWr_none_4cyc : SchedWriteRes<[]> { + let Latency = 4; + let NumMicroOps = 0; +} + +//===----------------------------------------------------------------------===// +// Define 1 micro-op types + +def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } +def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } +def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } +def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } +def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } +def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } +def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } +def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } +def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } + +def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } +def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } +def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } +def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } + +def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } +def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } +def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } + +def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } +def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } +def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } + +//===----------------------------------------------------------------------===// +// Define 2 micro-op types + +def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 1; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_5cyc : 
SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} +def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} +def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { + let Latency = 4; + let NumMicroOps = 2; +} +def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2, 8]; +} + +def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [2, 16]; +} + +def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 2; +} + +def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +def FalkorWr_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 2; +} + +//===----------------------------------------------------------------------===// +// Define 3 micro-op types + +def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 0; + let NumMicroOps = 3; +} + +def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, 
FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 3; +} + +def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def FalkorWr_1XYZ_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +def FalkorWr_1XYZ_1VSD_1ST_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitVSD, FalkorUnitST]> { + let Latency = 0; + let NumMicroOps = 3; +} +//===----------------------------------------------------------------------===// +// Define 4 micro-op types + +def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, + FalkorUnitVX, FalkorUnitVY]> { + let Latency = 2; + let NumMicroOps = 4; +} + +def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 2; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 3; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} +def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, + FalkorUnitSD, FalkorUnitLD]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def FalkorWr_2VSD_2ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 4; +} + +//===----------------------------------------------------------------------===// +// Define 5 micro-op types + +def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 5; +} +def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitVXVY]> { + let Latency = 7; + let NumMicroOps = 5; +} +def FalkorWr_1XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 5; +} +def FalkorWr_1VXVY_2ST_2VSD_0cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitST, + FalkorUnitVSD]> { + let 
Latency = 0; + let NumMicroOps = 5; +} +//===----------------------------------------------------------------------===// +// Define 6 micro-op types + +def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 6; +} + +def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, + FalkorUnitVSD, FalkorUnitXYZ, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_2VXVY_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +def FalkorWr_3VSD_3ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 6; +} + +//===----------------------------------------------------------------------===// +// Define 8 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 8; +} + +def FalkorWr_4VSD_4ST_0cyc: SchedWriteRes<[FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 8; +} + +//===----------------------------------------------------------------------===// +// Define 9 micro-op types + +def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitLD, + FalkorUnitLD, FalkorUnitXYZ, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, + FalkorUnitLD, FalkorUnitVXVY, + FalkorUnitVXVY, FalkorUnitXYZ, + FalkorUnitLD, FalkorUnitLD, + FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 9; +} + +//===----------------------------------------------------------------------===// +// Define 10 micro-op types + +def FalkorWr_2VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 10; +} + +//===----------------------------------------------------------------------===// +// Define 12 micro-op types + +def FalkorWr_4VXVY_4ST_4VSD_0cyc: SchedWriteRes<[FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD, + FalkorUnitVXVY, FalkorUnitST, + FalkorUnitVSD, FalkorUnitVXVY, + FalkorUnitST, FalkorUnitVSD]> { + let Latency = 0; + let NumMicroOps = 12; +} + +// Forwarding logic is modeled for multiply add/accumulate. 
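The SchedReadAdvance defs that follow encode that forwarding: a consumer reading a multiply result through its accumulator operand sees the value early by the stated number of cycles. As a rough, illustrative calculation (not part of the patch; it only restates the numbers defined here), the effective per-link latency of a dependent multiply-accumulate chain works out as:

  // Sketch: producer write latency minus the consumer operand's ReadAdvance.
  unsigned chainLatency(unsigned WriteLatency, unsigned ReadAdvance) {
    return WriteLatency > ReadAdvance ? WriteLatency - ReadAdvance : 0;
  }
  // chainLatency(5, 1) == 4  (FalkorWr_FMUL32_1VXVY_5cyc with FalkorReadFMA32)
  // chainLatency(6, 2) == 4  (FalkorWr_FMUL64_1VXVY_6cyc with FalkorReadFMA64)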
+// ----------------------------------------------------------------------------- +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; + +// SchedPredicates and WriteVariants for Immediate Zero and LSLFast/ASRFast +// ----------------------------------------------------------------------------- +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; +def FalkorFMOVZrReg : SchedPredicate<[{MI->getOperand(1).getReg() == AArch64::WZR || + MI->getOperand(1).getReg() == AArch64::XZR}]>; +def FalkorShiftExtFastPred : SchedPredicate<[{TII->isFalkorShiftExtFast(*MI)}]>; + +def FalkorWr_FMOV : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_MOVZ : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_ADDSUBsx : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_LDRro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_LDRSro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_PRFMro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_STRVro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_STRQro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +def FalkorWr_STRro : SchedWriteVariant<[ + SchedVar, + SchedVar]>; //===----------------------------------------------------------------------===// // Specialize the coarse model by associating instruction groups with the @@ -22,63 +524,76 @@ include "AArch64SchedFalkorWriteRes.td" // Miscellaneous // ----------------------------------------------------------------------------- -def : InstRW<[WriteI], (instrs COPY)>; +// FIXME: This could be better modeled by looking at the regclasses of the operands. 
+def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs COPY)>; // SIMD Floating-point Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)v2f32$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(16|32|64)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v2i32p)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FAC(GE|GT)(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|GE|GT)(32|64|v2f32|v2i32)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i32|v1i64|v2i32)rz$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)v2f32$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^F(MAX|MIN)(NM)?Vv4i32v$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FABD|FADD|FSUB)v2f32$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FADDP(v2i32p|v2i64p|v2f32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v2f32)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)v2i32(_shift)?$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FMULX32)>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FMULX64)>; -def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v2i64p)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex 
"^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32)rz$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instrs FCVTLv4i16, FCVTLv2i32)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)v2f32$")>; -def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32)$")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs FCVTLv8i16, FCVTLv4i32)>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32)(_shift)?$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instregex "^(FMUL|FMULX)(v2f64|v4f32|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instregex "^(FMUL|FMULX)v2i64_indexed$")>; -def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; +def : InstRW<[FalkorWr_3VXVY_4cyc], (instrs FCVTNv4i16, FCVTNv2i32, FCVTXNv2f32)>; +def : InstRW<[FalkorWr_3VXVY_5cyc], (instrs FCVTNv8i16, FCVTNv4i32, FCVTXNv4f32)>; -def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>; +def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32)$")>; -def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; - -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v2f32|(v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], + (instregex "^FML(A|S)(v4f32|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], + (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; // SIMD Integer Instructions // 
----------------------------------------------------------------------------- @@ -92,12 +607,14 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$" def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHLv1i64$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(S|U)SHRd$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP|CM(EQ|GE|HS|GT|HI))(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs PMULv8i8)>; def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^SHLd$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>; @@ -110,6 +627,8 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN) def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)RSHRd$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^R?SHRN(v2i32|v4i16|v8i8)_shift$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs ADDVv4i16v)>; @@ -120,10 +639,14 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64) def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], + (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; @@ -154,7 +677,7 @@ def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], 
(instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>; -def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL2?(v8i8|v16i8)$")>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^PMULL(v8i8|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>; @@ -165,14 +688,18 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16 def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^R?SHRN(v2i64|v4i32|v8i16|v16i8)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; +def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL(v1i64|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; @@ -186,99 +713,114 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; -def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^SQD(MLAL|MLSL)v[248].*$")>; // SIMD Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; -def : InstRW<[WriteVLD], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; -def : InstRW<[WriteVLD], (instrs LD2i64)>; -def : InstRW<[WriteVLD, WriteAdr], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; -def : InstRW<[WriteVLD, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; -def : InstRW<[WriteVLD, WriteAdr], (instrs LD2i64_POST)>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex 
"^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs LD2i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], (instrs LD2i64_POST)>; -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "LD1i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr], (instregex "LD1i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1VXVY_4cyc], (instregex "^LD1i(8|16|32)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc], (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; -def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD3i64_POST)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr], (instrs LD4i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD3i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD3i64_POST)>; +def : InstRW<[FalkorWr_2LD_3cyc], (instrs LD4i64)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc], (instrs LD4i64_POST)>; -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr], (instregex "^LD2i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_2VXVY_4cyc], (instregex "^LD2i(8|16|32)_POST$")>; -def : 
InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_1none_3cyc], (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; -def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instrs LD3Threev2d_POST)>; -def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_3LD_3cyc], (instrs LD3Threev2d)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instrs LD3Threev2d_POST)>; +def : InstRW<[FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_3LD_3cyc], (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "LD3i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr], (instregex "LD3i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3VXVY_4cyc], (instregex "^LD3i(8|16|32)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2none_3cyc], (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; -def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)$")>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instrs LD4Fourv2d_POST)>; -def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>; +def : InstRW<[FalkorWr_4LD_3cyc], (instrs LD4Fourv2d)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instrs LD4Fourv2d_POST)>; +def : InstRW<[FalkorWr_4LD_3cyc], (instregex 
"^LD4Rv(16b|8h|4s|2d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_4LD_3cyc], (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>; -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; -def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr], (instregex "^LD4i(8|16|32)_POST$")>; +def : InstRW<[FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4VXVY_4cyc], (instregex "^LD4i(8|16|32)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "LD3Threev(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "LD3Threev(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc], (instregex "^LD3Threev(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1none_4cyc], + (instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc], (instregex "^LD4Fourv(8b|4h|2s|1d)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2none_4cyc], + (instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "LD3Threev(16b|8h|4s)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD3Threev(16b|8h|4s)$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "LD3Threev(16b|8h|4s)_POST$")>; -def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>; +def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc], (instregex "^LD4Fourv(16b|8h|4s)$")>; + +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc], + (instregex "^LD3Threev(16b|8h|4s)_POST$")>; + +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc], + (instregex "^LD4Fourv(16b|8h|4s)_POST$")>; // Arithmetic and Logical Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_ADD], (instregex "^ADD(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CCMN|CCMP)(W|X)(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ADD(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(CSEL|CSINC|CSINV|CSNEG)(W|X)r$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^AND(S)?(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^BIC(S)?(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EON(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^EOR(W|X)r(i|r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORN(W|X)r(r|s)$")>; def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^ORR(W|X)r(i|r|s)$")>; -def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^SUB(S)?(W|X)r(s|x)$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SBC(S)?(W|X)r$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^SUB(S)?(W|X)r(r|i)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^ADD(S)?(W|X)r(s|x|x64)$")>; +def : InstRW<[FalkorWr_ADDSUBsx], (instregex "^SUB(S)?(W|X)r(s|x|x64)$")>; // SIMD Miscellaneous Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>; def : 
InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>; def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>; @@ -287,35 +829,42 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs NOTv8i8)>; def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^REV(16|32|64)v.*$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v2i32|v4i16|v8i8)$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "(S|U)QXTU?Nv.*$")>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instrs FRECPS64, FRSQRTS64)>; -def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc], + (instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; +def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs TBLv16i8One)>; +def : InstRW<[FalkorWr_2VXVY_2cyc], (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; -def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], + (instrs FRECPSv4f32, FRSQRTSv4f32)>; -def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], + (instrs FRECPSv2f64, FRSQRTSv2f64)>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; @@ -328,50 +877,95 @@ def : InstRW<[FalkorWr_5VXVY_7cyc], (instregex "^TBX(v8i8Four|v16i8Four)$")>; // SIMD Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteVST], (instregex "^STP(D|S)(i)$")>; -def : 
InstRW<[WriteVST, WriteAdr], (instregex "^STP(D|S)(post|pre)$")>; -def : InstRW<[FalkorWr_2XYZ_2ST_2VSD_0cyc], (instregex "^STRQro(W|X)$")>; -def : InstRW<[WriteVST], (instregex "^ST1(One(v8b|v4h|v2s|v1d)(_POST)?|(i8|i16|i32|i64)(_POST)?|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[WriteVST], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[WriteVST, WriteAdr], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STR(Q|D|S|H|B)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^STR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_STRVro], (instregex "^STR(D|S|H|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^STPQi$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^STPQ(post|pre)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STP(D|S)(i)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^STP(D|S)(post|pre)$")>; +def : InstRW<[FalkorWr_STRQro], (instregex "^STRQro(W|X)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^STUR(Q|D|S|B|H)i$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instrs STNPDi, STNPSi)>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instrs STNPQi)>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST3(i8|i16|i32|i64)$")>; -def : InstRW<[WriteVST, WriteVST], (instregex "^ST4(i8|i16|i32|i64)$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST3(i8|i16|i32|i64)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^ST4(i8|i16|i32|i64)_POST$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST1(One(v8b|v4h|v2s|v1d)_POST|(i8|i16|i32|i64)_POST)$")>; +def : InstRW<[FalkorWr_1VSD_1ST_0cyc], (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST1(One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VSD_1ST_0cyc], + (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>; -def : InstRW<[WriteV, WriteVST, WriteVST], (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; -def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST3(i8|i16|i32|i64)$")>; +def : InstRW<[FalkorWr_2VSD_2ST_0cyc], (instregex "^ST4(i8|i16|i32|i64)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST3(i8|i16|i32|i64)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VSD_2ST_0cyc], + (instregex "^ST4(i8|i16|i32|i64)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST], (instrs ST3Threev2d)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST3Threev2d_POST)>; +def : InstRW<[FalkorWr_1VXVY_2ST_2VSD_0cyc], + (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_1VXVY_2ST_2VSD_0cyc], + (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST], (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr], (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_3VSD_3ST_0cyc], (instrs ST3Threev2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], + (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_3VSD_3ST_0cyc], + (instrs ST3Threev2d_POST)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST], (instrs ST4Fourv2d)>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; -def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr], (instrs ST4Fourv2d_POST)>; +def : InstRW<[FalkorWr_2VXVY_2ST_2VSD_0cyc], + (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_2ST_2VSD_0cyc], + (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST3Three(v16b|v8h|v4s)$")>; -def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>; +def : InstRW<[FalkorWr_4VSD_4ST_0cyc], (instrs ST4Fourv2d)>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], + (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). 
+def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VSD_4ST_0cyc], + (instrs ST4Fourv2d_POST)>; -def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST], (instregex "^ST4Four(v16b|v8h|v4s)$")>; -def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; +def : InstRW<[FalkorWr_2VXVY_4ST_4VSD_0cyc], + (instregex "^ST3Three(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_2VXVY_4ST_4VSD_0cyc], + (instregex "^ST3Three(v16b|v8h|v4s)_POST$")>; + +def : InstRW<[FalkorWr_4VXVY_4ST_4VSD_0cyc], + (instregex "^ST4Four(v16b|v8h|v4s)$")>; +// FIXME: This is overly conservative in the imm POST case (no XYZ used in that case). +def : InstRW<[FalkorWr_1XYZ_1cyc, FalkorWr_4VXVY_4ST_4VSD_0cyc], + (instregex "^ST4Four(v16b|v8h|v4s)_POST$")>; // Branch Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1none_0cyc], (instrs B)>; +def : InstRW<[FalkorWr_1none_0cyc], (instrs B, TCRETURNdi)>; def : InstRW<[FalkorWr_1Z_0cyc], (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>; +def : InstRW<[FalkorWr_1Z_0cyc], (instrs RET_ReallyLR, TCRETURNri)>; def : InstRW<[FalkorWr_1ZB_0cyc], (instrs Bcc)>; def : InstRW<[FalkorWr_1XYZB_0cyc], (instrs BL)>; def : InstRW<[FalkorWr_1Z_1XY_0cyc], (instrs BLR)>; @@ -388,89 +982,103 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instrs SHA256SU1rrr)>; // FP Load Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteLD], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; -def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; -def : InstRW<[WriteLD], (instregex "^LDUR(Q|D|S|H|B)i$")>; -def : InstRW<[FalkorWr_LDR], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDNP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "LDP(D|S)i$")>; -def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "LDP(D|S)(pre|post)$")>; -def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], + (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(Q|D|S|H|B)i$")>; +def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instrs LDNPQi)>; +def : InstRW<[FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instrs LDPQi)>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDNP(D|S)i$")>; +def : InstRW<[FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDP(D|S)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_1none_3cyc, FalkorWr_none_3cyc], + (instregex "LDP(D|S)(pre|post)$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_2LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDPQ(pre|post)$")>; // FP Data Processing Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>; -def : 
InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(H|S|D)rrr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCCMP(E)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCMP(E)?(S|D)r(r|i)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVT(A|M|N|P|Z)(S|U)U(W|X)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(FABS|FNEG)(S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FCSEL(S|D)rrr$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTHSr, FCVTHDr)>; -def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^F(MAX|MIN)(NM)?Pv2i(32|64)p$")>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_2cyc], (instregex "^FRINT(A|I|M|N|P|X|Z)(S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(16|32|64)$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^FABD(32|64)$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^(FADD|FSUB)(S|D)rr$")>; +def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTHSr, FCVTHDr)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], + (instregex "^F(N)?MULSrr$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], + (instregex "^F(N)?MULDrr$")>; -def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; -def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; +def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(S|D)rr$")>; +def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(S|D)r$")>; -def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; -def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], + (instregex "^F(N)?M(ADD|SUB)Srrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], + (instregex "^F(N)?M(ADD|SUB)Drrr$")>; // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>; -def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; -def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>; -// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0 +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(WS|XD|XDHigh)r$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(S|D)i$")>; +def : 
InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)S(W|X)(D|S)ri$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(d|s)$")>; +def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Sr|Dr|v.*_ns)$")>; +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov wzr/xzr def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; +def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i32|v2i32|v1i64|v2f32|d|s)(_shift)?")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?")>; +def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)CVTF(v2i64|v4i32|v2f64|v4f32)(_shift)?")>; // Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFMui, PRFMl)>; def : InstRW<[FalkorWr_1ST_0cyc], (instrs PRFUMi)>; - -def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDNP(W|X)i$")>; -def : InstRW<[WriteLD, WriteLDHi], (instregex "^LDP(W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(B|H|W|X)ui$")>; -def : InstRW<[WriteLD, WriteAdr], (instregex "^LDR(B|H|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDNP(W|X)i$")>; +def : InstRW<[FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDP(W|X)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc, FalkorWr_none_3cyc], + (instregex "^LDP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_3cyc], + (instregex "^LDR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRro], (instregex "^LDR(BB|HH|W|X)ro(W|X)$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDR(W|X)l$")>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDTR(B|H|W|X)i$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(B|H|W|X)i$")>; - +def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^LDUR(BB|HH|W|X)i$")>; +def : InstRW<[FalkorWr_PRFMro], (instregex "^PRFMro(W|X)$")>; +def : InstRW<[FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], + (instrs LDPSWi)>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc, FalkorWr_none_4cyc], + (instregex "^LDPSW(post|pre)$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1LD_4cyc], + (instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; +def : InstRW<[FalkorWr_LDRSro], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instrs LDRSWl)>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>; def : InstRW<[FalkorWr_1LD_4cyc], (instregex "^LDURS(BW|BX|HW|HX|W)i$")>; -def : InstRW<[FalkorWr_PRFM], (instregex "^PRFMro(W|X)$")>; -def : InstRW<[FalkorWr_LDR], (instregex "^LDR(B|H|W|X)ro(W|X)$")>; - -def : InstRW<[FalkorWr_LDRS], (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>; - -def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>; -def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>; -def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>; -def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>; - // Miscellaneous Data-Processing Instructions // 
----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(S|U)?BFM(W|X)ri$")>; @@ -480,17 +1088,22 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; // Divide and Multiply Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; -def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], + (instregex "^M(ADD|SUB)Wrrr$")>; -def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; -def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], + (instregex "^M(ADD|SUB)Xrrr$")>; -def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; -def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; +def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; +def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>; -def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], + (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], + (instregex "^(S|U)(MLAL|MLSL)v.*$")>; // Move and Shift Instructions // ----------------------------------------------------------------------------- @@ -498,6 +1111,11 @@ def : InstRW<[FalkorWr_1XYZ_1cyc], (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W| def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^ADRP?$")>; def : InstRW<[FalkorWr_1XYZB_1cyc], (instregex "^MOVN(W|X)i$")>; def : InstRW<[FalkorWr_MOVZ], (instregex "^MOVZ(W|X)i$")>; +def : InstRW<[FalkorWr_1XYZ_1cyc], (instrs MOVi32imm, MOVi64imm)>; +def : InstRW<[WriteSequence<[FalkorWr_1XYZ_1cyc, FalkorWr_1XYZ_1cyc]>], + (instrs MOVaddr, MOVaddrBA, MOVaddrCP, MOVaddrEXT, MOVaddrJT, MOVaddrTLS)>; +def : InstRW<[WriteSequence<[FalkorWr_1LD_3cyc, FalkorWr_1XYZ_1cyc]>], + (instrs LOADgot)>; // Other Instructions // ----------------------------------------------------------------------------- @@ -507,13 +1125,12 @@ def : InstRW<[FalkorWr_1ST_0cyc], (instrs SYSxt, SYSLxt)>; def : InstRW<[FalkorWr_1Z_0cyc], (instrs MSRpstateImm1, MSRpstateImm4)>; def : InstRW<[FalkorWr_1LD_3cyc], (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>; -def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS)>; +def : InstRW<[FalkorWr_1LD_3cyc], (instrs MRS, MOVbaseTLS)>; def : InstRW<[FalkorWr_1LD_1Z_3cyc], (instrs DRPS)>; def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>; -def : InstRW<[WriteVST], (instrs STNPDi, STNPSi)>; -def : InstRW<[WriteSTP], (instrs STNPWi, STNPXi)>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs STNPWi, STNPXi)>; def : InstRW<[FalkorWr_2LD_1Z_3cyc], (instrs ERET)>; def : InstRW<[FalkorWr_1ST_1SD_1LD_3cyc], (instregex "^LDC.*$")>; @@ -523,20 +1140,16 @@ def : InstRW<[FalkorWr_1ST_1SD_1LD_0cyc], (instregex "^STXR(B|H|W|X)$")>; def : 
InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXP(W|X)$")>; def : InstRW<[FalkorWr_2LD_1ST_1SD_3cyc], (instregex "^STLXR(B|H|W|X)$")>; -def : InstRW<[WriteVST, WriteVST], (instrs STNPQi)>; // Store Instructions // ----------------------------------------------------------------------------- -def : InstRW<[WriteST], (instregex "^STP(W|X)i$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STP(W|X)(post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STR(Q|D|S|BB|HH)ui$")>; -def : InstRW<[WriteST], (instregex "^STUR(Q|D|S|BB|HH)i$")>; -def : InstRW<[WriteST], (instregex "^STR(B|H|W|X)ui$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)(post|pre)$")>; -def : InstRW<[WriteST], (instregex "^STTR(B|H|W|X)i$")>; -def : InstRW<[WriteST], (instregex "^STUR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STP(W|X)i$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], + (instregex "^STP(W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STR(BB|HH|W|X)ui$")>; +def : InstRW<[FalkorWr_none_1cyc, FalkorWr_1SD_1ST_0cyc], + (instregex "^STR(BB|HH|W|X)(post|pre)$")>; +def : InstRW<[FalkorWr_STRro], (instregex "^STR(BB|HH|W|X)ro(W|X)$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STTR(B|H|W|X)i$")>; +def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instregex "^STUR(BB|HH|W|X)i$")>; -def : InstRW<[WriteST, WriteAdr], (instregex "^STR(B|H|W|X)ro(W|X)$")>; - -def : InstRW<[WriteVST, WriteVST], (instregex "^STPQi$")>; -def : InstRW<[WriteVST, WriteVST, WriteAdr], (instregex "^STPQ(post|pre)$")>; diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td deleted file mode 100644 index 6526cc28e806..000000000000 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ /dev/null @@ -1,403 +0,0 @@ -//=- AArch64SchedFalkorWrRes.td - Falkor Write Res ---*- tablegen -*-=// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Contains all of the Falkor specific SchedWriteRes types. The approach -// below is to define a generic SchedWriteRes for every combination of -// latency and microOps. The naming conventions is to use a prefix, one field -// for latency, and one or more microOp count/type designators. -// Prefix: FalkorWr -// MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD) -// Latency: #cyc -// -// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued -// down one Z pipe, six SD pipes, four VX pipes and the total latency is -// six cycles. -// -// Contains all of the Falkor specific ReadAdvance types for forwarding logic. -// -// Contains all of the Falkor specific WriteVariant types for immediate zero -// and LSLFast. 
-//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// Define 1 micro-op types - -def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } -def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } -def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } -def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } -def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } -def FalkorWr_1LD_4cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 4; } -def FalkorWr_1XYZ_1cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; } -def FalkorWr_1XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; } -def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; } -def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; } -def FalkorWr_1none_0cyc : SchedWriteRes<[]> { let Latency = 0; } - -def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } -def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } -def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } -def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } -def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } -def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } - -def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } -def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } -def FalkorWr_1ST_3cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 3; } - -def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; } -def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; } -def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; } - -//===----------------------------------------------------------------------===// -// Define 2 micro-op types - -def FalkorWr_2VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 1; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 2; -} -def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 2; -} -def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 2; -} -def FalkorWr_FMUL64_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 2; -} - 
-def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_1XYZ_1LD_4cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_2LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 4; - let NumMicroOps = 2; -} - -def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> { - let Latency = 10; - let NumMicroOps = 2; -} - -def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_2GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> { - let Latency = 1; - let NumMicroOps = 2; -} - -def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> { - let Latency = 4; - let NumMicroOps = 2; -} -def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> { - let Latency = 5; - let NumMicroOps = 2; -} - -def FalkorWr_2XYZ_2cyc : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> { - let Latency = 2; - let NumMicroOps = 2; -} - -def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> { - let Latency = 0; - let NumMicroOps = 2; -} - -def FalkorWr_1X_1Z_8cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { - let Latency = 8; - let ResourceCycles = [2, 8]; -} - -def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> { - let Latency = 16; - let ResourceCycles = [2, 16]; -} - -def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 2; -} - -def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> { - let Latency = 0; - let NumMicroOps = 2; -} - -//===----------------------------------------------------------------------===// -// Define 3 micro-op types - -def FalkorWr_1ST_1SD_1LD_0cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, - FalkorUnitLD]> { - let Latency = 0; - let NumMicroOps = 3; -} - -def FalkorWr_1ST_1SD_1LD_3cyc : SchedWriteRes<[FalkorUnitST, FalkorUnitSD, - FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 5; - let NumMicroOps = 3; -} - -def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 3; -} - -def FalkorWr_1LD_2VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 3; -} - -def FalkorWr_2LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_3LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 3; -} - -def FalkorWr_2LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - 
FalkorUnitZ]> { - let Latency = 3; - let NumMicroOps = 3; -} - -//===----------------------------------------------------------------------===// -// Define 4 micro-op types - -def FalkorWr_2VX_2VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY, - FalkorUnitVX, FalkorUnitVY]> { - let Latency = 2; - let NumMicroOps = 4; -} - -def FalkorWr_4VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 2; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 3; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 4; -} -def FalkorWr_4VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 6; - let NumMicroOps = 4; -} - -def FalkorWr_4LD_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 4; -} - -def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -def FalkorWr_2LD_1ST_1SD_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitST, - FalkorUnitSD, FalkorUnitLD]> { - let Latency = 3; - let NumMicroOps = 4; -} - -//===----------------------------------------------------------------------===// -// Define 5 micro-op types - -def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 5; -} -def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 5; -} -def FalkorWr_5VXVY_7cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitVXVY]> { - let Latency = 7; - let NumMicroOps = 5; -} - -//===----------------------------------------------------------------------===// -// Define 6 micro-op types - -def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 6; -} - -def FalkorWr_2XYZ_2ST_2VSD_0cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST, - FalkorUnitVSD, FalkorUnitXYZ, - FalkorUnitST, FalkorUnitVSD]> { - let Latency = 0; - let NumMicroOps = 6; -} - -//===----------------------------------------------------------------------===// -// Define 8 micro-op types - -def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY, - FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 8; -} - -//===----------------------------------------------------------------------===// -// Define 9 micro-op types - -def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, - FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitLD, - FalkorUnitLD, FalkorUnitXYZ, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 4; - let NumMicroOps = 9; -} - -def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, - FalkorUnitLD, FalkorUnitVXVY, - FalkorUnitVXVY, FalkorUnitXYZ, - FalkorUnitLD, FalkorUnitLD, - FalkorUnitVXVY, FalkorUnitVXVY]> { - let Latency = 
4; - let NumMicroOps = 9; -} - -// Forwarding logic is modeled for multiply add/accumulate. -// ----------------------------------------------------------------------------- -def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; -def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; -def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; -def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; -def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; - -// SchedPredicates and WriteVariants for Immediate Zero and LSLFast -// ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; -def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; - -def FalkorWr_FMOV : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_MOVZ : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_LDR : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_ADD : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_PRFM : SchedWriteVariant<[ - SchedVar, - SchedVar]>; - -def FalkorWr_LDRS : SchedWriteVariant<[ - SchedVar, - SchedVar]>; diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index b369ee7e4ba2..d3cab1ad3397 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -90,7 +90,6 @@ void AArch64Subtarget::initializeProperties() { break; case Falkor: MaxInterleaveFactor = 4; - VectorInsertExtractBaseCost = 2; // FIXME: remove this to enable 64-bit SLP if performance looks good. MinVectorRegisterBitWidth = 128; break; diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 132f192f2a9a..cb3f72a524f5 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -10,10 +10,10 @@ // //===----------------------------------------------------------------------===// +#include "AArch64TargetMachine.h" #include "AArch64.h" #include "AArch64MacroFusion.h" #include "AArch64Subtarget.h" -#include "AArch64TargetMachine.h" #include "AArch64TargetObjectFile.h" #include "AArch64TargetTransformInfo.h" #include "MCTargetDesc/AArch64MCTargetDesc.h" @@ -23,6 +23,7 @@ #include "llvm/CodeGen/GlobalISel/IRTranslator.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/Localizer.h" #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" @@ -277,7 +278,7 @@ public: ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget(); - if (ST.hasFuseLiterals()) { + if (ST.hasFuseAES() || ST.hasFuseLiterals()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). 
ScheduleDAGMI *DAG = createGenericSchedPostRA(C); @@ -295,6 +296,7 @@ public: bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; + void addPreGlobalInstructionSelect() override; bool addGlobalInstructionSelect() override; #endif bool addILPOpts() override; @@ -404,6 +406,12 @@ bool AArch64PassConfig::addRegBankSelect() { return false; } +void AArch64PassConfig::addPreGlobalInstructionSelect() { + // Workaround the deficiency of the fast register allocator. + if (TM->getOptLevel() == CodeGenOpt::None) + addPass(new Localizer()); +} + bool AArch64PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index b279bd61e180..e7ebb37a9d62 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -425,7 +425,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP, + FeatureFastFMAF32, FeatureSDWA, FeatureDPP, FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; @@ -534,10 +534,12 @@ def AMDGPUAsmVariants { int VOP3_ID = 1; string SDWA = "SDWA"; int SDWA_ID = 2; + string SDWA9 = "SDWA9"; + int SDWA9_ID = 3; string DPP = "DPP"; - int DPP_ID = 3; + int DPP_ID = 4; string Disable = "Disable"; - int Disable_ID = 4; + int Disable_ID = 5; } def DefaultAMDGPUAsmParserVariant : AsmParserVariant { @@ -555,6 +557,12 @@ def SDWAAsmParserVariant : AsmParserVariant { let Name = AMDGPUAsmVariants.SDWA; } +def SDWA9AsmParserVariant : AsmParserVariant { + let Variant = AMDGPUAsmVariants.SDWA9_ID; + let Name = AMDGPUAsmVariants.SDWA9; +} + + def DPPAsmParserVariant : AsmParserVariant { let Variant = AMDGPUAsmVariants.DPP_ID; let Name = AMDGPUAsmVariants.DPP; @@ -567,6 +575,7 @@ def AMDGPU : Target { let AssemblyParserVariants = [DefaultAMDGPUAsmParserVariant, VOP3AsmParserVariant, SDWAAsmParserVariant, + SDWA9AsmParserVariant, DPPAsmParserVariant]; let AssemblyWriters = [AMDGPUAsmWriter]; } @@ -607,7 +616,10 @@ def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">, AssemblerPredicate<"FeatureVOP3P">; def HasSDWA : Predicate<"Subtarget->hasSDWA()">, - AssemblerPredicate<"FeatureSDWA">; + AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">; + +def HasSDWA9 : Predicate<"Subtarget->hasSDWA()">, + AssemblerPredicate<"FeatureSDWA,FeatureGFX9">; def HasDPP : Predicate<"Subtarget->hasDPP()">, AssemblerPredicate<"FeatureDPP">; diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 5ec46a8294c0..723e8a7b54e2 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -127,6 +127,29 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) { return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32); } +bool AMDGPUTargetLowering::isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op) +{ + assert(Op.getOpcode() == ISD::OR); + + SDValue N0 = Op->getOperand(0); + SDValue N1 = Op->getOperand(1); + EVT VT = N0.getValueType(); + + if (VT.isInteger() && !VT.isVector()) { + KnownBits LHSKnown, RHSKnown; + DAG.computeKnownBits(N0, LHSKnown); + + if (LHSKnown.Zero.getBoolValue()) { + DAG.computeKnownBits(N1, RHSKnown); + + if (!(~RHSKnown.Zero & ~LHSKnown.Zero)) + return true; + } + } + + return 
false; +} + AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI) : TargetLowering(TM), Subtarget(&STI) { @@ -2596,8 +2619,6 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); - if (VT != MVT::i64) - return SDValue(); ConstantSDNode *RHS = dyn_cast(N->getOperand(1)); if (!RHS) @@ -2618,6 +2639,8 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, case ISD::SIGN_EXTEND: case ISD::ANY_EXTEND: { // shl (ext x) => zext (shl x), if shift does not overflow int + if (VT != MVT::i64) + break; KnownBits Known; SDValue X = LHS->getOperand(0); DAG.computeKnownBits(X, Known); @@ -2628,7 +2651,22 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } + case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break; + case ISD::ADD: { // Fall through from above + // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) + if (ConstantSDNode *C2 = dyn_cast(LHS->getOperand(1))) { + SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), + SDValue(RHS, 0)); + SDValue C2V = DAG.getConstant(C2->getAPIntValue() << RHSVal, + SDLoc(C2), VT); + return DAG.getNode(LHS->getOpcode(), SL, VT, Shl, C2V); + } + break; } + } + + if (VT != MVT::i64) + return SDValue(); // i64 (shl x, C) -> (build_pair 0, (shl x, C -32)) @@ -3440,7 +3478,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, DL); } - if ((OffsetVal + WidthVal) >= 32) { + if ((OffsetVal + WidthVal) >= 32 && + !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) { SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32); return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32, BitsFrom, ShiftVal); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index fb2f15022d25..0d066cdbdff4 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -34,6 +34,9 @@ private: /// compare. SDValue getFFBH_U32(SelectionDAG &DAG, SDValue Op, const SDLoc &DL) const; +public: + static bool isOrEquivalentToAdd(SelectionDAG &DAG, SDValue Op); + protected: const AMDGPUSubtarget *Subtarget; AMDGPUAS AMDGPUASI; diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 9de302994e68..57905be18813 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -36,6 +36,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); + setAction({G_FCONSTANT, S32}, Legal); + setAction({G_GEP, P1}, Legal); setAction({G_GEP, P2}, Legal); setAction({G_GEP, 1, S64}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 85184b363905..07f92918a43f 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -97,6 +97,9 @@ private: Instruction *UseInst, int OpIdx0, int OpIdx1) const; + /// Check whether we have enough local memory for promotion. 
+ bool hasSufficientLocalMem(const Function &F); + public: static char ID; @@ -107,7 +110,7 @@ public: StringRef getPassName() const override { return "AMDGPU Promote Alloca"; } - void handleAlloca(AllocaInst &I); + bool handleAlloca(AllocaInst &I, bool SufficientLDS); void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -147,105 +150,21 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) { const AMDGPUSubtarget &ST = TM->getSubtarget(F); if (!ST.isPromoteAllocaEnabled()) return false; + AS = AMDGPU::getAMDGPUAS(*F.getParent()); - FunctionType *FTy = F.getFunctionType(); - - // If the function has any arguments in the local address space, then it's - // possible these arguments require the entire local memory space, so - // we cannot use local memory in the pass. - for (Type *ParamTy : FTy->params()) { - PointerType *PtrTy = dyn_cast(ParamTy); - if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { - LocalMemLimit = 0; - DEBUG(dbgs() << "Function has local memory argument. Promoting to " - "local memory disabled.\n"); - return false; - } - } - - LocalMemLimit = ST.getLocalMemorySize(); - if (LocalMemLimit == 0) - return false; - - const DataLayout &DL = Mod->getDataLayout(); - - // Check how much local memory is being used by global objects - CurrentLocalMemUsage = 0; - for (GlobalVariable &GV : Mod->globals()) { - if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) - continue; - - for (const User *U : GV.users()) { - const Instruction *Use = dyn_cast(U); - if (!Use) - continue; - - if (Use->getParent()->getParent() == &F) { - unsigned Align = GV.getAlignment(); - if (Align == 0) - Align = DL.getABITypeAlignment(GV.getValueType()); - - // FIXME: Try to account for padding here. The padding is currently - // determined from the inverse order of uses in the function. I'm not - // sure if the use list order is in any way connected to this, so the - // total reported size is likely incorrect. - uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); - CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); - CurrentLocalMemUsage += AllocSize; - break; - } - } - } - - unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, - F); - - // Restrict local memory usage so that we don't drastically reduce occupancy, - // unless it is already significantly reduced. - - // TODO: Have some sort of hint or other heuristics to guess occupancy based - // on other factors.. - unsigned OccupancyHint = ST.getWavesPerEU(F).second; - if (OccupancyHint == 0) - OccupancyHint = 7; - - // Clamp to max value. - OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); - - // Check the hint but ignore it if it's obviously wrong from the existing LDS - // usage. - MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); - - - // Round up to the next tier of usage. - unsigned MaxSizeWithWaveCount - = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); - - // Program is possibly broken by using more local mem than available. 
- if (CurrentLocalMemUsage > MaxSizeWithWaveCount) - return false; - - LocalMemLimit = MaxSizeWithWaveCount; - - DEBUG( - dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" - << " Rounding size to " << MaxSizeWithWaveCount - << " with a maximum occupancy of " << MaxOccupancy << '\n' - << " and " << (LocalMemLimit - CurrentLocalMemUsage) - << " available for promotion\n" - ); - + bool SufficientLDS = hasSufficientLocalMem(F); + bool Changed = false; BasicBlock &EntryBB = *F.begin(); for (auto I = EntryBB.begin(), E = EntryBB.end(); I != E; ) { AllocaInst *AI = dyn_cast(I); ++I; if (AI) - handleAlloca(*AI); + Changed |= handleAlloca(*AI, SufficientLDS); } - return true; + return Changed; } std::pair @@ -661,12 +580,105 @@ bool AMDGPUPromoteAlloca::collectUsesWithPtrTypes( return true; } +bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) { + + FunctionType *FTy = F.getFunctionType(); + const AMDGPUSubtarget &ST = TM->getSubtarget(F); + + // If the function has any arguments in the local address space, then it's + // possible these arguments require the entire local memory space, so + // we cannot use local memory in the pass. + for (Type *ParamTy : FTy->params()) { + PointerType *PtrTy = dyn_cast(ParamTy); + if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) { + LocalMemLimit = 0; + DEBUG(dbgs() << "Function has local memory argument. Promoting to " + "local memory disabled.\n"); + return false; + } + } + + LocalMemLimit = ST.getLocalMemorySize(); + if (LocalMemLimit == 0) + return false; + + const DataLayout &DL = Mod->getDataLayout(); + + // Check how much local memory is being used by global objects + CurrentLocalMemUsage = 0; + for (GlobalVariable &GV : Mod->globals()) { + if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS) + continue; + + for (const User *U : GV.users()) { + const Instruction *Use = dyn_cast(U); + if (!Use) + continue; + + if (Use->getParent()->getParent() == &F) { + unsigned Align = GV.getAlignment(); + if (Align == 0) + Align = DL.getABITypeAlignment(GV.getValueType()); + + // FIXME: Try to account for padding here. The padding is currently + // determined from the inverse order of uses in the function. I'm not + // sure if the use list order is in any way connected to this, so the + // total reported size is likely incorrect. + uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType()); + CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align); + CurrentLocalMemUsage += AllocSize; + break; + } + } + } + + unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, + F); + + // Restrict local memory usage so that we don't drastically reduce occupancy, + // unless it is already significantly reduced. + + // TODO: Have some sort of hint or other heuristics to guess occupancy based + // on other factors.. + unsigned OccupancyHint = ST.getWavesPerEU(F).second; + if (OccupancyHint == 0) + OccupancyHint = 7; + + // Clamp to max value. + OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU()); + + // Check the hint but ignore it if it's obviously wrong from the existing LDS + // usage. + MaxOccupancy = std::min(OccupancyHint, MaxOccupancy); + + + // Round up to the next tier of usage. + unsigned MaxSizeWithWaveCount + = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F); + + // Program is possibly broken by using more local mem than available. 
+ if (CurrentLocalMemUsage > MaxSizeWithWaveCount) + return false; + + LocalMemLimit = MaxSizeWithWaveCount; + + DEBUG( + dbgs() << F.getName() << " uses " << CurrentLocalMemUsage << " bytes of LDS\n" + << " Rounding size to " << MaxSizeWithWaveCount + << " with a maximum occupancy of " << MaxOccupancy << '\n' + << " and " << (LocalMemLimit - CurrentLocalMemUsage) + << " available for promotion\n" + ); + + return true; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. -void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { +bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) { // Array allocations are probably not worth handling, since an allocation of // the array type is the canonical form. if (!I.isStaticAlloca() || I.isArrayAllocation()) - return; + return false; IRBuilder<> Builder(&I); @@ -675,10 +687,8 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { DEBUG(dbgs() << "Trying to promote " << I << '\n'); - if (tryPromoteAllocaToVector(&I, AS)) { - DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n"); - return; - } + if (tryPromoteAllocaToVector(&I, AS)) + return true; // Promoted to vector. const Function &ContainingFunction = *I.getParent()->getParent(); CallingConv::ID CC = ContainingFunction.getCallingConv(); @@ -692,9 +702,13 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { break; default: DEBUG(dbgs() << " promote alloca to LDS not supported with calling convention.\n"); - return; + return false; } + // Not likely to have sufficient local memory for promotion. + if (!SufficientLDS) + return false; + const AMDGPUSubtarget &ST = TM->getSubtarget(ContainingFunction); unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second; @@ -718,7 +732,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (NewSize > LocalMemLimit) { DEBUG(dbgs() << " " << AllocSize << " bytes of local memory not available to promote\n"); - return; + return false; } CurrentLocalMemUsage = NewSize; @@ -727,7 +741,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { if (!collectUsesWithPtrTypes(&I, &I, WorkList)) { DEBUG(dbgs() << " Do not know how to convert all uses\n"); - return; + return false; } DEBUG(dbgs() << "Promoting alloca to local memory\n"); @@ -873,6 +887,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) { llvm_unreachable("Don't know how to promote alloca intrinsic use."); } } + return true; } FunctionPass *llvm::createAMDGPUPromoteAlloca() { diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index e543cae07ada..660879426810 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -416,6 +416,10 @@ public: return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; } + bool hasSDWA() const { + return HasSDWA; + } + /// \brief Returns the offset in bytes from the start of the input buffer /// of the first explicit kernel argument. 
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const { @@ -670,10 +674,6 @@ public: return HasInv2PiInlineImm; } - bool hasSDWA() const { - return HasSDWA; - } - bool hasDPP() const { return HasDPP; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b52ea2b3a2c6..f5541e08e1b7 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -881,6 +881,10 @@ public: return AMDGPU::isVI(getSTI()); } + bool isGFX9() const { + return AMDGPU::isGFX9(getSTI()); + } + bool hasInv2PiInlineImm() const { return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]; } @@ -989,7 +993,6 @@ private: bool usesConstantBus(const MCInst &Inst, unsigned OpIdx); bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const; unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const; - bool isSGPR(unsigned Reg); public: OperandMatchResultTy parseOptionalOperand(OperandVector &Operands); @@ -1042,9 +1045,10 @@ public: OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands); void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands); + void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands); void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands); void cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType); + uint64_t BasicInstType, bool skipVcc = false); }; struct OptionalOperand { @@ -1966,7 +1970,8 @@ ArrayRef AMDGPUAsmParser::getMatchedVariants() const { } if (isForcedSDWA()) { - static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA}; + static const unsigned Variants[] = {AMDGPUAsmVariants::SDWA, + AMDGPUAsmVariants::SDWA9}; return makeArrayRef(Variants); } @@ -1977,7 +1982,7 @@ ArrayRef AMDGPUAsmParser::getMatchedVariants() const { static const unsigned Variants[] = { AMDGPUAsmVariants::DEFAULT, AMDGPUAsmVariants::VOP3, - AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::DPP + AMDGPUAsmVariants::SDWA, AMDGPUAsmVariants::SDWA9, AMDGPUAsmVariants::DPP }; return makeArrayRef(Variants); @@ -2000,14 +2005,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const { return AMDGPU::NoRegister; } -bool AMDGPUAsmParser::isSGPR(unsigned Reg) { - const MCRegisterInfo *TRI = getContext().getRegisterInfo(); - const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); - const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); - return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || - Reg == AMDGPU::SCC; -} - // NB: This code is correct only when used to check constant // bus limitations because GFX7 support no f16 inline constants. 
// Note that there are no cases when a GFX7 opcode violates @@ -2049,7 +2046,8 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) { if (MO.isImm()) { return !isInlineConstant(Inst, OpIdx); } - return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg())); + return !MO.isReg() || + isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo()); } bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { @@ -2060,7 +2058,8 @@ bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) { if (Desc.TSFlags & (SIInstrFlags::VOPC | SIInstrFlags::VOP1 | SIInstrFlags::VOP2 | - SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) { + SIInstrFlags::VOP3 | SIInstrFlags::VOP3P | + SIInstrFlags::SDWA)) { // Check special imm operands (used by madmk, etc) if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) { @@ -4151,14 +4150,19 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) { cvtSDWA(Inst, Operands, SIInstrFlags::VOP2); } +void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) { + cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true); +} + void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) { - cvtSDWA(Inst, Operands, SIInstrFlags::VOPC); + cvtSDWA(Inst, Operands, SIInstrFlags::VOPC, isVI()); } void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, - uint64_t BasicInstType) { + uint64_t BasicInstType, bool skipVcc) { using namespace llvm::AMDGPU::SDWA; OptionalImmIndexMap OptionalIdx; + bool skippedVcc = false; unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); @@ -4168,15 +4172,22 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, for (unsigned E = Operands.size(); I != E; ++I) { AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - // Add the register arguments - if ((BasicInstType == SIInstrFlags::VOPC || - BasicInstType == SIInstrFlags::VOP2)&& - Op.isReg() && - Op.Reg.RegNo == AMDGPU::VCC) { - // VOPC and VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. - // Skip it. - continue; - } else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + if (skipVcc && !skippedVcc && Op.isReg() && Op.Reg.RegNo == AMDGPU::VCC) { + // VOP2b (v_add_u32, v_sub_u32 ...) sdwa use "vcc" token as dst. + // Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3) + // or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand. + // Skip VCC only if we didn't skip it on previous iteration. 
+ if (BasicInstType == SIInstrFlags::VOP2 && + (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) { + skippedVcc = true; + continue; + } else if (BasicInstType == SIInstrFlags::VOPC && + Inst.getNumOperands() == 0) { + skippedVcc = true; + continue; + } + } + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { Op.addRegWithInputModsOperands(Inst, 2); } else if (Op.isImm()) { // Handle optional arguments @@ -4184,20 +4195,30 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, } else { llvm_unreachable("Invalid operand type"); } + skippedVcc = false; } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); - - if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { + if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx9 && + Inst.getOpcode() != AMDGPU::V_NOP_sdwa_vi) { // V_NOP_sdwa_vi has no optional sdwa arguments switch (BasicInstType) { case SIInstrFlags::VOP1: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); break; case SIInstrFlags::VOP2: + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + if (isGFX9() && + AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); @@ -4205,6 +4226,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, break; case SIInstrFlags::VOPC: + if (isVI()) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0); + } addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD); addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD); break; @@ -4220,10 +4244,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands, Inst.getOpcode() == AMDGPU::V_MAC_F16_sdwa_vi) { auto it = Inst.begin(); std::advance( - it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); + it, AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::src2)); Inst.insert(it, Inst.getOperand(0)); // src2 = dst } - } /// Force static initialization. 
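The SDWA9 support threaded through the assembler-parser changes above and the disassembler/MC-emitter changes below encodes a source operand as a small register index plus a flag distinguishing VGPR from SGPR sources (see getSDWA9SrcEncoding and decodeSDWA9Src further down). The standalone C++ sketch here only illustrates that round trip; the mask values (0xFF, 0x100) and the helper names are assumptions chosen for illustration, not values taken from this patch.

// Minimal sketch of the SDWA9 source-operand encode/decode round trip.
// The masks below are assumed (low 8 bits = register index, bit 8 = SGPR flag);
// the real values live in the AMDGPU SDWA9EncValues definitions.
#include <cassert>
#include <cstdint>

namespace sdwa9_sketch {
constexpr uint32_t SrcVgprMask = 0xFF;  // assumed: low bits select the register
constexpr uint32_t SrcSgprFlag = 0x100; // assumed: set when the source is an SGPR

// Encode a register index as an SDWA9 source operand, mirroring the shape of
// getSDWA9SrcEncoding: keep the low bits and OR in the SGPR flag if needed.
inline uint32_t encodeSrc(uint32_t RegIdx, bool IsSGPR) {
  uint32_t Enc = RegIdx & SrcVgprMask;
  return IsSGPR ? (Enc | SrcSgprFlag) : Enc;
}

// Decode back to (index, is-SGPR), mirroring decodeSDWA9Src, which checks the
// VGPR range first and otherwise treats the value as an SGPR-range encoding.
inline void decodeSrc(uint32_t Enc, uint32_t &RegIdx, bool &IsSGPR) {
  IsSGPR = (Enc & SrcSgprFlag) != 0;
  RegIdx = Enc & SrcVgprMask;
}
} // namespace sdwa9_sketch

int main() {
  uint32_t Idx = 0;
  bool IsSGPR = false;
  sdwa9_sketch::decodeSrc(sdwa9_sketch::encodeSrc(17, /*IsSGPR=*/true), Idx, IsSGPR);
  assert(Idx == 17 && IsSGPR); // round trip preserves both fields
  return 0;
}

Keeping the SGPR marker in a single flag bit is what lets the decoder below treat anything outside the VGPR range as an SGPR or special register, which is how the decodeSDWA9Src change is structured.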
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 137b5cca96ce..9b3cde7c4df6 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -62,32 +62,33 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm, return addOperand(Inst, MCOperand::createImm(Imm)); } -#define DECODE_OPERAND2(RegClass, DecName) \ -static DecodeStatus Decode##RegClass##RegisterClass(MCInst &Inst, \ - unsigned Imm, \ - uint64_t /*Addr*/, \ - const void *Decoder) { \ +#define DECODE_OPERAND(StaticDecoderName, DecoderName) \ +static DecodeStatus StaticDecoderName(MCInst &Inst, \ + unsigned Imm, \ + uint64_t /*Addr*/, \ + const void *Decoder) { \ auto DAsm = static_cast(Decoder); \ - return addOperand(Inst, DAsm->decodeOperand_##DecName(Imm)); \ + return addOperand(Inst, DAsm->DecoderName(Imm)); \ } -#define DECODE_OPERAND(RegClass) DECODE_OPERAND2(RegClass, RegClass) +#define DECODE_OPERAND_REG(RegClass) \ +DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) -DECODE_OPERAND(VGPR_32) -DECODE_OPERAND(VS_32) -DECODE_OPERAND(VS_64) +DECODE_OPERAND_REG(VGPR_32) +DECODE_OPERAND_REG(VS_32) +DECODE_OPERAND_REG(VS_64) -DECODE_OPERAND(VReg_64) -DECODE_OPERAND(VReg_96) -DECODE_OPERAND(VReg_128) +DECODE_OPERAND_REG(VReg_64) +DECODE_OPERAND_REG(VReg_96) +DECODE_OPERAND_REG(VReg_128) -DECODE_OPERAND(SReg_32) -DECODE_OPERAND(SReg_32_XM0_XEXEC) -DECODE_OPERAND(SReg_64) -DECODE_OPERAND(SReg_64_XEXEC) -DECODE_OPERAND(SReg_128) -DECODE_OPERAND(SReg_256) -DECODE_OPERAND(SReg_512) +DECODE_OPERAND_REG(SReg_32) +DECODE_OPERAND_REG(SReg_32_XM0_XEXEC) +DECODE_OPERAND_REG(SReg_64) +DECODE_OPERAND_REG(SReg_64_XEXEC) +DECODE_OPERAND_REG(SReg_128) +DECODE_OPERAND_REG(SReg_256) +DECODE_OPERAND_REG(SReg_512) static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, @@ -106,6 +107,13 @@ static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm)); } +#define DECODE_SDWA9(DecName) \ +DECODE_OPERAND(decodeSDWA9##DecName, decodeSDWA9##DecName) + +DECODE_SDWA9(Src32) +DECODE_SDWA9(Src16) +DECODE_SDWA9(VopcDst) + #include "AMDGPUGenDisassemblerTables.inc" //===----------------------------------------------------------------------===// @@ -164,6 +172,9 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address); if (Res) break; + + Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address); + if (Res) break; } // Reinitialize Bytes as DPP64 could have eaten too much @@ -582,6 +593,48 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const { return errOperand(Val, "unknown operand encoding " + Twine(Val)); } +MCOperand AMDGPUDisassembler::decodeSDWA9Src(const OpWidthTy Width, + unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (SDWA9EncValues::SRC_VGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_VGPR_MAX) { + return createRegOperand(getVgprClassId(Width), + Val - SDWA9EncValues::SRC_VGPR_MIN); + } + if (SDWA9EncValues::SRC_SGPR_MIN <= Val && + Val <= SDWA9EncValues::SRC_SGPR_MAX) { + return createSRegOperand(getSgprClassId(Width), + Val - SDWA9EncValues::SRC_SGPR_MIN); + } + + return decodeSpecialReg32(Val - SDWA9EncValues::SRC_SGPR_MIN); +} + +MCOperand AMDGPUDisassembler::decodeSDWA9Src16(unsigned Val) const { + return decodeSDWA9Src(OPW16, Val); +} + +MCOperand AMDGPUDisassembler::decodeSDWA9Src32(unsigned Val) const { + 
return decodeSDWA9Src(OPW32, Val); +} + + +MCOperand AMDGPUDisassembler::decodeSDWA9VopcDst(unsigned Val) const { + using namespace AMDGPU::SDWA; + + if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) { + Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + if (Val > AMDGPU::EncValues::SGPR_MAX) { + return decodeSpecialReg64(Val); + } else { + return createSRegOperand(getSgprClassId(OPW64), Val); + } + } else { + return createRegOperand(AMDGPU::VCC); + } +} + //===----------------------------------------------------------------------===// // AMDGPUSymbolizer //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 620bae0a6d1a..0ff405a71e9b 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -104,6 +104,11 @@ public: MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; MCOperand decodeSpecialReg32(unsigned Val) const; MCOperand decodeSpecialReg64(unsigned Val) const; + + MCOperand decodeSDWA9Src(const OpWidthTy Width, unsigned Val) const; + MCOperand decodeSDWA9Src16(unsigned Val) const; + MCOperand decodeSDWA9Src32(unsigned Val) const; + MCOperand decodeSDWA9VopcDst(unsigned Val) const; }; //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 3bb5c9bc22b7..8ead48067336 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -191,6 +191,7 @@ public: } }; +namespace { // just a stub to make base class happy class SchedStrategyStub : public MachineSchedStrategy { public: @@ -202,6 +203,7 @@ public: void releaseTopNode(SUnit *SU) override {} void releaseBottomNode(SUnit *SU) override {} }; +} // namespace GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C, StrategyKind S) diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index c6d0f2179950..d378df674be9 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -17,6 +17,7 @@ using namespace llvm; #define DEBUG_TYPE "misched" +namespace { class GCNMinRegScheduler { struct Candidate : ilist_node { const SUnit *SU; @@ -71,6 +72,7 @@ public: std::vector schedule(ArrayRef TopRoots, const ScheduleDAG &DAG); }; +} // namespace void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) { NumPreds.resize(SUnits.size()); diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 18374dca3f84..390a8286c76a 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -211,9 +211,9 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO, return getLiveLaneMask(MO.getReg(), SI, LIS, MRI); } -SmallVector collectVirtualRegUses(const MachineInstr &MI, - const LiveIntervals &LIS, - const MachineRegisterInfo &MRI) { +static SmallVector +collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS, + const MachineRegisterInfo &MRI) { SmallVector Res; for (const auto &MO : MI.operands()) { if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg())) diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h index 3d3858ab47ec..a856b17a228f 100644 --- 
a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -52,6 +52,18 @@ public: return 0; } + virtual unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + + virtual unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + return 0; + } + protected: uint64_t computeAvailableFeatures(const FeatureBitset &FB) const; void verifyInstructionPredicates(const MCInst &MI, diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index bda0928036fd..e02acf516c0d 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -69,6 +69,14 @@ public: unsigned getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; + + unsigned getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const override; }; } // end anonymous namespace @@ -319,6 +327,44 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, return getMachineOpValue(MI, MO, Fixups, STI); } +unsigned +SIMCCodeEmitter::getSDWA9SrcEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::SRC_VGPR_MASK; + if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) { + RegEnc |= SDWA9EncValues::SRC_SGPR_MASK; + } + return RegEnc; +} + +unsigned +SIMCCodeEmitter::getSDWA9VopcDstEncoding(const MCInst &MI, unsigned OpNo, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + using namespace AMDGPU::SDWA; + + uint64_t RegEnc = 0; + + const MCOperand &MO = MI.getOperand(OpNo); + + unsigned Reg = MO.getReg(); + if (Reg != AMDGPU::VCC) { + RegEnc |= MRI.getEncodingValue(Reg); + RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK; + RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK; + } + return RegEnc; +} + uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO, SmallVectorImpl &Fixups, diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 3590a9b05e1d..60b913cfd39a 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1618,6 +1618,14 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } +bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + // Local and Private addresses do not handle vectors. 
Limit to i32 + if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { + return (MemVT.getSizeInBits() <= 32); + } + return true; +} + bool R600TargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index 9700ce14c6f3..d6a0876a6ee7 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -44,6 +44,8 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/R600RegisterInfo.td b/lib/Target/AMDGPU/R600RegisterInfo.td index cc667d985a82..3c1e8527284c 100644 --- a/lib/Target/AMDGPU/R600RegisterInfo.td +++ b/lib/Target/AMDGPU/R600RegisterInfo.td @@ -226,7 +226,7 @@ def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add R600_Addr, R600_KC0, R600_KC1, ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF, - ALU_CONST, ALU_PARAM, OQAP + ALU_CONST, ALU_PARAM, OQAP, INDIRECT_BASE_ADDR )>; def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index a01330cb9171..80967edee0ab 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -118,6 +118,10 @@ namespace AMDGPU { // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, + // Operand for GFX9 SDWA instructions + OPERAND_SDWA9_SRC, + OPERAND_SDWA9_VOPC_DST, + /// Operand with 32-bit immediate that uses the constant bus. OPERAND_KIMM32, OPERAND_KIMM16 @@ -160,7 +164,8 @@ namespace AMDGPUAsmVariants { DEFAULT = 0, VOP3 = 1, SDWA = 2, - DPP = 3 + SDWA9 = 3, + DPP = 4 }; } @@ -294,6 +299,18 @@ enum DstUnused { UNUSED_PRESERVE = 2, }; +enum SDWA9EncValues{ + SRC_SGPR_MASK = 0x100, + SRC_VGPR_MASK = 0xFF, + VOPC_DST_VCC_MASK = 0x80, + VOPC_DST_SGPR_MASK = 0x7F, + + SRC_VGPR_MIN = 0, + SRC_VGPR_MAX = 255, + SRC_SGPR_MIN = 256, + SRC_SGPR_MAX = 357, +}; + } // namespace SDWA } // namespace AMDGPU diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 01c1f78e7ca4..76c2644867aa 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -698,6 +698,18 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } } +bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { + if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { + return (MemVT.getSizeInBits() <= 4 * 32); + } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { + unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize(); + return (MemVT.getSizeInBits() <= MaxPrivateBits); + } else if (AS == AMDGPUASI.LOCAL_ADDRESS) { + return (MemVT.getSizeInBits() <= 2 * 32); + } + return true; +} + bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, unsigned Align, @@ -4229,12 +4241,40 @@ SDValue SITargetLowering::performAndCombine(SDNode *N, SDValue RHS = N->getOperand(1); - if (VT == MVT::i64) { - const ConstantSDNode *CRHS = dyn_cast(RHS); - if (CRHS) { - if (SDValue Split - = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS)) - return Split; + const ConstantSDNode *CRHS = dyn_cast(RHS); + if (VT == MVT::i64 && CRHS) { + if (SDValue Split + = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, 
LHS, CRHS)) + return Split; + } + + if (CRHS && VT == MVT::i32) { + // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb + // nb = number of trailing zeroes in mask + // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass, + // given that we are selecting 8 or 16 bit fields starting at byte boundary. + uint64_t Mask = CRHS->getZExtValue(); + unsigned Bits = countPopulation(Mask); + if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL && + (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) { + if (auto *CShift = dyn_cast(LHS->getOperand(1))) { + unsigned Shift = CShift->getZExtValue(); + unsigned NB = CRHS->getAPIntValue().countTrailingZeros(); + unsigned Offset = NB + Shift; + if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary. + SDLoc SL(N); + SDValue BFE = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + LHS->getOperand(0), + DAG.getConstant(Offset, SL, MVT::i32), + DAG.getConstant(Bits, SL, MVT::i32)); + EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits); + SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE, + DAG.getValueType(NarrowVT)); + SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext, + DAG.getConstant(NB, SDLoc(CRHS), MVT::i32)); + return Shl; + } + } } } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index e68837747491..8e2ec40b224c 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -150,6 +150,8 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, bool *IsFast) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 38a16b525a75..36d29b8ecf06 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2331,6 +2331,10 @@ static bool isSubRegOf(const SIRegisterInfo &TRI, bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, StringRef &ErrInfo) const { uint16_t Opcode = MI.getOpcode(); + + if (SIInstrInfo::isGenericOpcode(MI.getOpcode())) + return true; + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 7b052844f177..c5287c7f64ba 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -439,6 +439,27 @@ def ExpSrc3 : RegisterOperand { let ParserMatchClass = VReg32OrOffClass; } +class SDWA9Src : RegisterOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_SRC"; + let EncoderMethod = "getSDWA9SrcEncoding"; +} + +def SDWA9Src32 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src32"; +} + +def SDWA9Src16 : SDWA9Src { + let DecoderMethod = "decodeSDWA9Src16"; +} + +def SDWA9VopcDst : VOPDstOperand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_SDWA9_VOPC_DST"; + let EncoderMethod = "getSDWA9VopcDstEncoding"; + let DecoderMethod = "decodeSDWA9VopcDst"; +} + class NamedMatchClass : AsmOperandClass { let Name = "Imm"#CName; let PredicateMethod = "is"#CName; @@ -588,6 +609,16 @@ class IntInputMods : InputMods def Int32InputMods : IntInputMods; def Int64InputMods : IntInputMods; +def FPRegInputModsMatchClass : 
AsmOperandClass { + let Name = "RegWithFPInputMods"; + let ParserMethod = "parseRegWithFPInputMods"; + let PredicateMethod = "isRegKind"; +} + +def FPRegInputMods : InputMods { + let PrintMethod = "printOperandAndFPInputMods"; +} + def FPVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithFPInputMods"; let ParserMethod = "parseRegWithFPInputMods"; @@ -598,6 +629,17 @@ def FPVRegInputMods : InputMods { let PrintMethod = "printOperandAndFPInputMods"; } + +def IntRegInputModsMatchClass : AsmOperandClass { + let Name = "RegWithIntInputMods"; + let ParserMethod = "parseRegWithIntInputMods"; + let PredicateMethod = "isRegKind"; +} + +def IntRegInputMods : InputMods { + let PrintMethod = "printOperandAndIntInputMods"; +} + def IntVRegInputModsMatchClass : AsmOperandClass { let Name = "VRegWithIntInputMods"; let ParserMethod = "parseRegWithIntInputMods"; @@ -783,6 +825,14 @@ class getVALUDstForVT { VOPDstOperand)))); // else VT == i1 } +// Returns the register class to use for the destination of VOP[12C] +// instructions with GFX9 SDWA extension +class getSDWA9DstForVT { + RegisterOperand ret = !if(!eq(VT.Size, 1), + SDWA9VopcDst, // VOPC + VOPDstOperand); // VOP1/2 32-bit dst +} + // Returns the register class to use for source 0 of VOP[12C] // instructions for the given VT. class getVOPSrc0ForVT { @@ -823,6 +873,9 @@ class getVregSrcForVT { !if(!eq(VT.Size, 64), VReg_64, VGPR_32)); } +class getSDWA9SrcForVT { + RegisterOperand ret = !if(!eq(VT.Size, 16), SDWA9Src16, SDWA9Src32); +} // Returns the register class to use for sources of VOP3 instructions for the // given VT. @@ -926,6 +979,15 @@ class getSrcModExt { Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods); } +// Return type of input modifiers operand specified input operand for SDWA 9 +class getSrcModSDWA9 { + bit isFP = !if(!eq(VT.Value, f16.Value), 1, + !if(!eq(VT.Value, f32.Value), 1, + !if(!eq(VT.Value, f64.Value), 1, + 0))); + Operand ret = !if(isFP, FPRegInputMods, IntRegInputMods); +} + // Returns the input arguments for VOP[12C] instructions for the given SrcVT. 
class getIns32 { dag ret = !if(!eq(NumSrcArgs, 1), (ins Src0RC:$src0), // VOP1 @@ -1062,6 +1124,7 @@ class getInsSDWA { + + dag ret = !if(!eq(NumSrcArgs, 0), + // VOP1 without input operands (V_NOP) + (ins), + !if(!eq(NumSrcArgs, 1), + // VOP1 + !if(!eq(HasSDWAOMod, 0), + // VOP1_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel)), + !if(!eq(NumSrcArgs, 2), + !if(!eq(DstVT.Size, 1), + // VOPC_SDWA9 + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP2_SDWA9 + !if(!eq(HasSDWAOMod, 0), + // VOP2_SDWA9 without omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel), + // VOP1_SDWA9 with omod + (ins Src0Mod:$src0_modifiers, Src0RC:$src0, + Src1Mod:$src1_modifiers, Src1RC:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel))), + (ins)/* endif */))); +} + // Outs for DPP and SDWA -class getOutsExt { +class getOutsExt { dag ret = !if(HasDst, !if(!eq(DstVT.Size, 1), (outs), // no dst for VOPC, we use "vcc"-token as dst in SDWA VOPC instructions - (outs DstRCDPP:$vdst)), + (outs DstRCExt:$vdst)), + (outs)); // V_NOP +} + +// Outs for GFX9 SDWA +class getOutsSDWA9 { + dag ret = !if(HasDst, + !if(!eq(DstVT.Size, 1), + (outs DstRCSDWA9:$sdst), + (outs DstRCSDWA9:$vdst)), (outs)); // V_NOP } @@ -1153,8 +1269,7 @@ class getAsmDPP { +class getAsmSDWA { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), " vcc", // use vcc token as dst for VOPC instructioins @@ -1182,6 +1297,35 @@ class getAsmSDWA { + string dst = !if(HasDst, + !if(!eq(DstVT.Size, 1), + "$sdst", // VOPC + "$vdst"), // VOP1/2 + ""); + string src0 = "$src0_modifiers"; + string src1 = "$src1_modifiers"; + string out_mods = !if(!eq(HasOMod, 0), "$clamp", "$clamp$omod"); + string args = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + ", "#src0, + ", "#src0#", "#src1 + ) + ); + string sdwa = !if(!eq(NumSrcArgs, 0), "", + !if(!eq(NumSrcArgs, 1), + out_mods#" $dst_sel $dst_unused $src0_sel", + !if(!eq(DstVT.Size, 1), + " $src0_sel $src1_sel", // No dst_sel, dst_unused and output modifiers for VOPC + out_mods#" $dst_sel $dst_unused $src0_sel $src1_sel" + ) + ) + ); + string ret = dst#args#sdwa; +} + + // Function that checks if instruction supports DPP and SDWA class getHasExt { @@ -1219,6 +1363,7 @@ class VOPProfile _ArgVT> { field RegisterOperand DstRC = getVALUDstForVT.ret; field RegisterOperand DstRCDPP = getVALUDstForVT.ret; field RegisterOperand DstRCSDWA = getVALUDstForVT.ret; + field RegisterOperand DstRCSDWA9 = getSDWA9DstForVT.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; field RegisterClass Src1RC32 = getVregSrcForVT.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; @@ -1228,6 +1373,8 @@ class VOPProfile _ArgVT> { field RegisterClass Src1DPP = getVregSrcForVT.ret; field RegisterClass Src0SDWA = getVregSrcForVT.ret; field RegisterClass Src1SDWA = getVregSrcForVT.ret; + field RegisterOperand Src0SDWA9 = getSDWA9SrcForVT.ret; + field RegisterOperand Src1SDWA9 = getSDWA9SrcForVT.ret; field Operand Src0Mod = getSrcMod.ret; field Operand Src1Mod = 
getSrcMod.ret; field Operand Src2Mod = getSrcMod.ret; @@ -1235,6 +1382,8 @@ class VOPProfile _ArgVT> { field Operand Src1ModDPP = getSrcModExt.ret; field Operand Src0ModSDWA = getSrcModExt.ret; field Operand Src1ModSDWA = getSrcModExt.ret; + field Operand Src0ModSDWA9 = getSrcModSDWA9.ret; + field Operand Src1ModSDWA9 = getSrcModSDWA9.ret; field bit HasDst = !if(!eq(DstVT.Value, untyped.Value), 0, 1); @@ -1261,14 +1410,16 @@ class VOPProfile _ArgVT> { field bit HasSrc2Mods = !if(HasModifiers, BitOr.ret, 0); field bit HasClamp = HasModifiers; - field bit HasSDWAClamp = HasSrc0; + field bit HasSDWAClamp = EmitDst; field bit HasFPClamp = BitAnd.ret, HasClamp>.ret; field bit IsPacked = isPackedType.ret; field bit HasOpSel = IsPacked; field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasSDWAOMod = isFloatType.ret; field bit HasExt = getHasExt.ret; + field bit HasSDWA9 = HasExt; field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods); field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods); @@ -1282,6 +1433,7 @@ class VOPProfile _ArgVT> { field dag Outs64 = Outs; field dag OutsDPP = getOutsExt.ret; field dag OutsSDWA = getOutsExt.ret; + field dag OutsSDWA9 = getOutsSDWA9.ret; field dag Ins32 = getIns32.ret; field dag Ins64 = getIns64 _ArgVT> { field dag InsSDWA = getInsSDWA.ret; + field dag InsSDWA9 = getInsSDWA9.ret; field string Asm32 = getAsm32.ret; field string Asm64 = getAsm64.ret; field string AsmVOP3P = getAsmVOP3P.ret; field string AsmDPP = getAsmDPP.ret; - field string AsmSDWA = getAsmSDWA.ret; + field string AsmSDWA = getAsmSDWA.ret; + field string AsmSDWA9 = getAsmSDWA9.ret; } class VOP_NO_EXT : VOPProfile { let HasExt = 0; + let HasSDWA9 = 0; } def VOP_F16_F16 : VOPProfile <[f16, f16, untyped, untyped]>; @@ -1446,6 +1603,15 @@ def getSDWAOp : InstrMapping { let ValueCols = [["SDWA"]]; } +// Maps ordinary instructions to their SDWA GFX9 counterparts +def getSDWA9Op : InstrMapping { + let FilterClass = "VOP"; + let RowFields = ["OpName"]; + let ColFields = ["AsmVariantName"]; + let KeyCol = ["Default"]; + let ValueCols = [["SDWA9"]]; +} + def getMaskedMIMGOp : InstrMapping { let FilterClass = "MIMG_Mask"; let RowFields = ["Op"]; diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td index f2d8b6f7b7a4..ec29a66c8bbb 100644 --- a/lib/Target/AMDGPU/SOPInstructions.td +++ b/lib/Target/AMDGPU/SOPInstructions.td @@ -184,7 +184,9 @@ def S_BITSET0_B32 : SOP1_32 <"s_bitset0_b32">; def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64">; def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32">; def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64">; -def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">; +def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64", + [(set i64:$sdst, (int_amdgcn_s_getpc))] +>; let isTerminator = 1, isBarrier = 1, SchedRW = [WriteBranch] in { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 2abd4afad3b6..630f469eabf0 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -544,6 +544,17 @@ bool isVI(const MCSubtargetInfo &STI) { return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]; } +bool isGFX9(const MCSubtargetInfo &STI) { + return STI.getFeatureBits()[AMDGPU::FeatureGFX9]; +} + +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { + const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID); + const unsigned FirstSubReg = TRI->getSubReg(Reg, 1); + return 
SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) || + Reg == AMDGPU::SCC; +} + unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { switch(Reg) { diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 8e74aa2cc9a8..19888ad7556a 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -273,6 +273,10 @@ inline bool isKernel(CallingConv::ID CC) { bool isSI(const MCSubtargetInfo &STI); bool isCI(const MCSubtargetInfo &STI); bool isVI(const MCSubtargetInfo &STI); +bool isGFX9(const MCSubtargetInfo &STI); + +/// \brief Is Reg - scalar register +bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI); /// If \p Reg is a pseudo reg, return the correct hardware register given /// \p STI otherwise return \p Reg. diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td index 1febc6bf8ec2..95b5ef0a49db 100644 --- a/lib/Target/AMDGPU/VOP1Instructions.td +++ b/lib/Target/AMDGPU/VOP1Instructions.td @@ -30,6 +30,15 @@ class VOP1_SDWAe op, VOPProfile P> : VOP_SDWAe
<P>
{ let Inst{31-25} = 0x3f; // encoding } +class VOP1_SDWA9Ae <bits<8> op, VOPProfile P> : VOP_SDWA9Ae <P>
{ + bits<8> vdst; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = op; + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{31-25} = 0x3f; // encoding +} + class VOP1_Pseudo pattern=[], bit VOP1Only = 0> : InstSI , VOP , @@ -84,6 +93,11 @@ class VOP1_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP1"; } +class VOP1_SDWA9_Pseudo pattern=[]> : + VOP_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP1"; +} + class getVOP1Pat64 : LetDummies { list ret = !if(P.HasModifiers, @@ -103,6 +117,7 @@ multiclass VOP1Inst ; def _e64 : VOP3_Pseudo .ret>; def _sdwa : VOP1_SDWA_Pseudo ; + def _sdwa9 : VOP1_SDWA9_Pseudo ; } // Special profile for instructions which have clamp @@ -243,6 +258,7 @@ def VOP_I32_VI32_NO_EXT : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC64 = VRegSrc_32; let HasExt = 0; + let HasSDWA9 = 0; } // Special case because there are no true output operands. Hack vdst @@ -258,16 +274,21 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> { let Ins64 = (ins Src0RC64:$vdst, VSrc_b32:$src0); let InsDPP = (ins Src0RC32:$vdst, Src0RC32:$src0, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); - let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, VCSrc_b32:$src0, + let InsSDWA = (ins Src0RC32:$vdst, Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel); + let InsSDWA9 = (ins Src0RC32:$vdst, Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + clampmod:$clamp, omod:$omod, dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel); let Asm32 = getAsm32<1, 1>.ret; let Asm64 = getAsm64<1, 1, 0, 1>.ret; let AsmDPP = getAsmDPP<1, 1, 0>.ret; - let AsmSDWA = getAsmSDWA<1, 1, 0>.ret; + let AsmSDWA = getAsmSDWA<1, 1>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 0, 1>.ret; let HasExt = 0; + let HasSDWA9 = 0; let HasDst = 0; let EmitDst = 1; // force vdst emission } @@ -324,7 +345,7 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>; defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>; @@ -347,7 +368,7 @@ defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>; } -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { def : Pat< (f32 (f16_to_fp i16:$src)), @@ -523,6 +544,10 @@ multiclass VOP1_Real_vi op> { VOP_SDWA_Real (NAME#"_sdwa")>, VOP1_SDWAe (NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa9")>, + VOP1_SDWA9Ae (NAME#"_sdwa9").Pfl>; + // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP1_DPP(NAME#"_e32")>; diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 4a11d9471f1d..657cacaa792c 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -48,6 +48,18 @@ class VOP2_SDWAe op, VOPProfile P> : VOP_SDWAe
<P>
{ let Inst{31} = 0x0; // encoding } +class VOP2_SDWA9Ae <bits<6> op, VOPProfile P> : VOP_SDWA9Ae <P>
{ + bits<8> vdst; + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{30-25} = op; + let Inst{31} = 0x0; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + class VOP2_Pseudo pattern=[], string suffix = "_e32"> : InstSI , VOP , @@ -102,6 +114,11 @@ class VOP2_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOP2"; } +class VOP2_SDWA9_Pseudo pattern=[]> : + VOP_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2"; +} + class getVOP2Pat64 : LetDummies { list ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, @@ -121,10 +138,10 @@ multiclass VOP2Inst .ret>, Commutable_REV; - def _sdwa : VOP2_SDWA_Pseudo ; + def _sdwa : VOP2_SDWA_Pseudo ; + def _sdwa9 : VOP2_SDWA9_Pseudo ; } -// TODO: add SDWA pseudo instructions for VOP2bInst and VOP2eInst multiclass VOP2bInst , Commutable_REV; - def _sdwa : VOP2_SDWA_Pseudo ; + def _sdwa : VOP2_SDWA_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } + + def _sdwa9 : VOP2_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOP2b"; + } } def _e64 : VOP3_Pseudo .ret>, @@ -203,13 +226,21 @@ class VOP_MAC : VOPProfile <[vt, vt, vt, vt]> { VGPR_32:$src2, // stub argument clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + VGPR_32:$src2, // stub argument + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); let Asm32 = getAsm32<1, 2, vt>.ret; let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret; let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret; - let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret; + let AsmSDWA = getAsmSDWA<1, 2, vt>.ret; + let AsmSDWA9 = getAsmSDWA9<1, 1, 2, vt>.ret; let HasSrc2 = 0; let HasSrc2Mods = 0; let HasExt = 1; + let HasSDWA9 = 0; } def VOP_MAC_F16 : VOP_MAC { @@ -229,6 +260,7 @@ def VOP2b_I32_I1_I32_I32 : VOPProfile<[i32, i32, i32, untyped]> { let Asm32 = "$vdst, vcc, $src0, $src1"; let Asm64 = "$vdst, $sdst, $src0, $src1"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers$clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1 $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -246,6 +278,7 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { let Asm32 = "$vdst, vcc, $src0, $src1, vcc"; let Asm64 = "$vdst, $sdst, $src0, $src1, $src2"; let AsmSDWA = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; + let AsmSDWA9 = "$vdst, vcc, $src0_modifiers, $src1_modifiers, vcc $clamp $dst_sel $dst_unused $src0_sel $src1_sel"; let AsmDPP = "$vdst, vcc, $src0, $src1, vcc $dpp_ctrl$row_mask$bank_mask$bound_ctrl"; let Outs32 = (outs DstRC:$vdst); let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); @@ -254,16 +287,23 @@ def VOP2b_I32_I1_I32_I32_I1 : VOPProfile<[i32, i32, i32, i1]> { // implicit VCC use. 
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1); - let InsSDWA = (ins Src0Mod:$src0_modifiers, Src0SDWA:$src0, - Src1Mod:$src1_modifiers, Src1SDWA:$src1, + let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, + Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + clampmod:$clamp, omod:$omod, + dst_sel:$dst_sel, dst_unused:$dst_unused, + src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsDPP = (ins Src0Mod:$src0_modifiers, Src0DPP:$src0, Src1Mod:$src1_modifiers, Src1DPP:$src1, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let HasExt = 1; + let HasSDWA9 = 1; } // Read in from vcc or arbitrary SGPR @@ -387,7 +427,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; } // End let SubtargetPredicate = SICI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; @@ -418,7 +458,7 @@ defm V_MAC_F16 : VOP2Inst <"v_mac_f16", VOP_MAC_F16>; } } // End isCommutable = 1 -} // End SubtargetPredicate = isVI +} // End SubtargetPredicate = Has16BitInsts // Note: 16-bit instructions produce a 0 result in the high 16-bits. multiclass Arithmetic_i16_Pats { @@ -468,7 +508,7 @@ class ZExt_i16_i1_Pat : Pat < (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src) >; -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { defm : Arithmetic_i16_Pats; defm : Arithmetic_i16_Pats; @@ -513,7 +553,7 @@ def : Pat< (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) >; -} // End Predicates = [isVI] +} // End Predicates = [Has16BitInsts] //===----------------------------------------------------------------------===// // SI @@ -686,15 +726,21 @@ multiclass VOP2_SDWA_Real op> { VOP2_SDWAe (NAME#"_sdwa").Pfl>; } +multiclass VOP2_SDWA9_Real op> { + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa9")>, + VOP2_SDWA9Ae (NAME#"_sdwa9").Pfl>; +} + multiclass VOP2be_Real_e32e64_vi op> : - Base_VOP2be_Real_e32e64_vi, VOP2_SDWA_Real { + Base_VOP2be_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP(NAME#"_e32")>; } multiclass VOP2_Real_e32e64_vi op> : - Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real { + Base_VOP2_Real_e32e64_vi, VOP2_SDWA_Real, VOP2_SDWA9_Real { // For now left dpp only for asm/dasm // TODO: add corresponding pseudo def _dpp : VOP2_DPP(NAME#"_e32")>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index c0b5069948fb..001fc960b228 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -243,7 +243,7 @@ def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } // End SubtargetPredicate = isCIVI -let SubtargetPredicate = isVI in { +let SubtargetPredicate = Has16BitInsts in { let isCommutable = 1 in { @@ -258,12 +258,13 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile>; def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile>; } // End isCommutable = 1 +} // End SubtargetPredicate = Has16BitInsts +let SubtargetPredicate = isVI in { def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile>; - } // End SubtargetPredicate = isVI -let Predicates = [isVI] in { +let Predicates = [Has16BitInsts] in { multiclass 
Ternary_i16_Pats { @@ -288,7 +289,7 @@ def : Pat< defm: Ternary_i16_Pats; defm: Ternary_i16_Pats; -} // End Predicates = [isVI] +} // End Predicates = [Has16BitInsts] let SubtargetPredicate = isGFX9 in { def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index a3550a63677b..cd347b86d305 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -34,6 +34,17 @@ class VOPC_SDWAe op, VOPProfile P> : VOP_SDWAe
<P>
{ let Inst{44-43} = SDWA.UNUSED_PRESERVE; } +class VOPC_SDWA9e <bits<8> op, VOPProfile P> : VOP_SDWA9Be <P>
{ + bits<9> src1; + + let Inst{8-0} = 0xf9; // sdwa + let Inst{16-9} = !if(P.HasSrc1, src1{7-0}, 0); + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; // encoding + let Inst{63} = !if(P.HasSrc1, src1{8}, 0); // src1_sgpr +} + + //===----------------------------------------------------------------------===// // VOPC classes //===----------------------------------------------------------------------===// @@ -102,6 +113,11 @@ class VOPC_SDWA_Pseudo pattern=[]> : let AsmMatchConverter = "cvtSdwaVOPC"; } +class VOPC_SDWA9_Pseudo pattern=[]> : + VOP_SDWA9_Pseudo { + let AsmMatchConverter = "cvtSdwaVOPC"; +} + // This class is used only with VOPC instructions. Use $sdst for out operand class VOPCInstAlias : InstAlias , PredicateControl { @@ -173,6 +189,13 @@ multiclass VOPC_Pseudos { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = P.Schedule; + let isConvergent = DefExec; + let isCompare = 1; + } } def VOPC_I1_F16_F16 : VOPC_Profile<[Write32Bit], f16>; @@ -520,7 +543,11 @@ class VOPC_Class_Profile sched, ValueType vt> : let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0, Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1, clampmod:$clamp, src0_sel:$src0_sel, src1_sel:$src1_sel); + let InsSDWA9 = (ins Src0ModSDWA9:$src0_modifiers, Src0SDWA9:$src0, + Src1ModSDWA9:$src1_modifiers, Src1SDWA9:$src1, + src0_sel:$src0_sel, src1_sel:$src1_sel); let AsmSDWA = " vcc, $src0_modifiers, $src1_modifiers$clamp $src0_sel $src1_sel"; + //let AsmSDWA9 = " $sdst, $src0_modifiers, $src1_modifiers $src0_sel $src1_sel"; let HasSrc1Mods = 0; let HasClamp = 0; let HasOMod = 0; @@ -553,6 +580,12 @@ multiclass VOPC_Class_Pseudos { let SchedRW = p.Schedule; let isConvergent = DefExec; } + + def _sdwa9 : VOPC_SDWA9_Pseudo { + let Defs = !if(DefExec, [VCC, EXEC], [VCC]); + let SchedRW = p.Schedule; + let isConvergent = DefExec; + } } def VOPC_I1_F16_I32 : VOPC_Class_Profile<[Write32Bit], f16>; @@ -920,6 +953,10 @@ multiclass VOPC_Real_vi op> { VOP_SDWA_Real (NAME#"_sdwa")>, VOPC_SDWAe (NAME#"_sdwa").Pfl>; + def _sdwa_gfx9 : + VOP_SDWA9_Real (NAME#"_sdwa9")>, + VOPC_SDWA9e (NAME#"_sdwa9").Pfl>; + def : VOPCInstAlias (NAME#"_e64"), !cast(NAME#"_e32_vi")> { let AssemblerPredicate = isVI; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 69906c419db3..4da654f84f9d 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -293,11 +293,52 @@ class VOP_SDWAe : Enc64 { let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); - let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); - let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); +} + +// gfx9 SDWA basic encoding +class VOP_SDWA9e : Enc64 { + bits<9> src0; // {src0_sgpr{0}, src0{7-0}} + bits<3> src0_sel; + bits<2> src0_modifiers; // float: {abs,neg}, int {sext} + bits<3> src1_sel; + bits<2> src1_modifiers; + bits<1> src1_sgpr; + + let Inst{39-32} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{50-48} = !if(P.HasSrc0, src0_sel{2-0}, SDWA.DWORD); + let Inst{51} = !if(P.HasSrc0IntMods, src0_modifiers{0}, 0); + let 
Inst{53-52} = !if(P.HasSrc0FloatMods, src0_modifiers{1-0}, 0); + let Inst{55} = !if(P.HasSrc0, src0{8}, 0); + let Inst{58-56} = !if(P.HasSrc1, src1_sel{2-0}, SDWA.DWORD); + let Inst{59} = !if(P.HasSrc1IntMods, src1_modifiers{0}, 0); + let Inst{61-60} = !if(P.HasSrc1FloatMods, src1_modifiers{1-0}, 0); + let Inst{63} = 0; // src1_sgpr - should be specified in subclass +} + +// gfx9 SDWA-A +class VOP_SDWA9Ae : VOP_SDWA9e
<P>
{ + bits<3> dst_sel; + bits<2> dst_unused; + bits<1> clamp; + bits<2> omod; + + let Inst{42-40} = !if(P.EmitDst, dst_sel{2-0}, SDWA.DWORD); + let Inst{44-43} = !if(P.EmitDst, dst_unused{1-0}, SDWA.UNUSED_PRESERVE); + let Inst{45} = !if(P.HasSDWAClamp, clamp{0}, 0); + let Inst{47-46} = !if(P.HasSDWAOMod, omod{1-0}, 0); +} + +// gfx9 SDWA-B +class VOP_SDWA9Be : VOP_SDWA9e
<P>
{ + bits<8> sdst; // {vcc_sdst{0}, sdst{6-0}} + + let Inst{46-40} = !if(P.EmitDst, sdst{6-0}, 0); + let Inst{47} = !if(P.EmitDst, sdst{7}, 0); } class VOP_SDWA_Pseudo pattern=[]> : @@ -331,6 +372,50 @@ class VOP_SDWA_Pseudo pattern=[]> : VOPProfile Pfl = P; } +// GFX9 adds two features to SDWA: +// 1. Add 3 fields to the SDWA microcode word: S0, S1 and OMOD. +// a. S0 and S1 indicate that source 0 and 1 respectively are SGPRs rather +// than VGPRs (at most 1 can be an SGPR); +// b. OMOD is the standard output modifier (result *2, *4, /2) +// 2. Add a new version of the SDWA microcode word for VOPC: SDWAB. This +// replaces OMOD and the dest fields with SD and SDST (SGPR destination) +// field. +// a. When SD=1, the SDST is used as the destination for the compare result; +// b.when SD=0, VCC is used. +// +// In GFX9, V_MAC_F16, V_MAC_F32 opcodes cannot be used with SDWA + +class VOP_SDWA9_Pseudo pattern=[]> : + InstSI , + VOP , + SIMCInstr , + MnemonicAlias { + + let isPseudo = 1; + let isCodeGenOnly = 1; + let UseNamedOperandTable = 1; + + string Mnemonic = opName; + string AsmOperands = P.AsmSDWA9; + + let Size = 8; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; + + let VALU = 1; + let SDWA = 1; + let Uses = [EXEC]; + + let SubtargetPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); + let AssemblerPredicate = !if(P.HasSDWA9, HasSDWA9, DisableInst); + let AsmVariantName = !if(P.HasSDWA9, AMDGPUAsmVariants.SDWA9, + AMDGPUAsmVariants.Disable); + let DecoderNamespace = "SDWA9"; + + VOPProfile Pfl = P; +} + class VOP_SDWA_Real : InstSI , SIMCInstr { @@ -358,6 +443,33 @@ class VOP_SDWA_Real : let TSFlags = ps.TSFlags; } +class VOP_SDWA9_Real : + InstSI , + SIMCInstr { + + let isPseudo = 0; + let isCodeGenOnly = 0; + + let Defs = ps.Defs; + let Uses = ps.Uses; + let SchedRW = ps.SchedRW; + let hasSideEffects = ps.hasSideEffects; + + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + + // Copy relevant pseudo op flags + let SubtargetPredicate = ps.SubtargetPredicate; + let AssemblerPredicate = ps.AssemblerPredicate; + let AsmMatchConverter = ps.AsmMatchConverter; + let AsmVariantName = ps.AsmVariantName; + let UseNamedOperandTable = ps.UseNamedOperandTable; + let DecoderNamespace = ps.DecoderNamespace; + let Constraints = ps.Constraints; + let DisableEncoding = ps.DisableEncoding; + let TSFlags = ps.TSFlags; +} + class VOP_DPPe : Enc64 { bits<2> src0_modifiers; bits<8> src0; diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 46ac4d0ad933..31a2f499a9a7 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -34,6 +34,9 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI) static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI, Type *T) { + if (T->isArrayTy()) + return true; + EVT VT = TLI.getValueType(DL, T, true); if (!VT.isSimple() || VT.isVector() || !(VT.isInteger() || VT.isFloatingPoint())) @@ -148,23 +151,47 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler { }; } // End anonymous namespace. 
-void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg, - SmallVectorImpl &SplitArgs, - const DataLayout &DL, - MachineRegisterInfo &MRI) const { +void ARMCallLowering::splitToValueTypes( + const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, + MachineFunction &MF, const SplitArgTy &PerformArgSplit) const { const ARMTargetLowering &TLI = *getTLI(); LLVMContext &Ctx = OrigArg.Ty->getContext(); + const DataLayout &DL = MF.getDataLayout(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + const Function *F = MF.getFunction(); SmallVector SplitVTs; SmallVector Offsets; ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); - assert(SplitVTs.size() == 1 && "Unsupported type"); + if (SplitVTs.size() == 1) { + // Even if there is no splitting to do, we still want to replace the + // original type (e.g. pointer type -> integer). + SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), + OrigArg.Flags, OrigArg.IsFixed); + return; + } - // Even if there is no splitting to do, we still want to replace the original - // type (e.g. pointer type -> integer). - SplitArgs.emplace_back(OrigArg.Reg, SplitVTs[0].getTypeForEVT(Ctx), - OrigArg.Flags, OrigArg.IsFixed); + unsigned FirstRegIdx = SplitArgs.size(); + for (unsigned i = 0, e = SplitVTs.size(); i != e; ++i) { + EVT SplitVT = SplitVTs[i]; + Type *SplitTy = SplitVT.getTypeForEVT(Ctx); + auto Flags = OrigArg.Flags; + bool NeedsConsecutiveRegisters = + TLI.functionArgumentNeedsConsecutiveRegisters( + SplitTy, F->getCallingConv(), F->isVarArg()); + if (NeedsConsecutiveRegisters) { + Flags.setInConsecutiveRegs(); + if (i == e - 1) + Flags.setInConsecutiveRegsLast(); + } + SplitArgs.push_back( + ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)), + SplitTy, Flags, OrigArg.IsFixed}); + } + + for (unsigned i = 0; i < Offsets.size(); ++i) + PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8); } /// Lower the return value for the already existing \p Ret. This assumes that @@ -187,7 +214,9 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder, SmallVector SplitVTs; ArgInfo RetInfo(VReg, Val->getType()); setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F); - splitToValueTypes(RetInfo, SplitVTs, DL, MF.getRegInfo()); + splitToValueTypes(RetInfo, SplitVTs, MF, [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, VReg, Offset); + }); CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg()); @@ -307,6 +336,26 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { return 1; } + /// Merge the values in \p SrcRegs into \p DstReg at offsets \p SrcOffsets. + /// Note that the source registers are not required to have homogeneous types, + /// so we use G_INSERT rather than G_MERGE_VALUES. + // FIXME: Use G_MERGE_VALUES if the types are homogeneous. + void mergeRegisters(unsigned DstReg, ArrayRef SrcRegs, + ArrayRef SrcOffsets) { + LLT Ty = MRI.getType(DstReg); + + unsigned Dst = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildUndef(Dst); + + for (unsigned i = 0; i < SrcRegs.size(); ++i) { + unsigned Tmp = MRI.createGenericVirtualRegister(Ty); + MIRBuilder.buildInsert(Tmp, Dst, SrcRegs[i], SrcOffsets[i]); + Dst = Tmp; + } + + MIRBuilder.buildCopy(DstReg, Dst); + } + /// Marking a physical register as used is different between formal /// parameters, where it's a basic block live-in, and call returns, where it's /// an implicit-def of the call instruction. 
@@ -335,6 +384,7 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, return false; auto &MF = MIRBuilder.getMF(); + auto &MBB = MIRBuilder.getMBB(); auto DL = MF.getDataLayout(); auto &TLI = *getTLI(); @@ -350,17 +400,34 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, CCAssignFn *AssignFn = TLI.CCAssignFnForCall(F.getCallingConv(), F.isVarArg()); + FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), + AssignFn); + SmallVector ArgInfos; + SmallVector SplitRegs; + SmallVector RegOffsets; unsigned Idx = 0; for (auto &Arg : F.args()) { ArgInfo AInfo(VRegs[Idx], Arg.getType()); setArgFlags(AInfo, Idx + AttributeList::FirstArgIndex, DL, F); - splitToValueTypes(AInfo, ArgInfos, DL, MF.getRegInfo()); + + SplitRegs.clear(); + RegOffsets.clear(); + + splitToValueTypes(AInfo, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { + SplitRegs.push_back(Reg); + RegOffsets.push_back(Offset); + }); + + if (!SplitRegs.empty()) + ArgHandler.mergeRegisters(VRegs[Idx], SplitRegs, RegOffsets); + Idx++; } - FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(), - AssignFn); + if (!MBB.empty()) + MIRBuilder.setInstr(*MBB.begin()); + return handleAssignments(MIRBuilder, ArgInfos, ArgHandler); } @@ -407,7 +474,9 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, if (!Arg.IsFixed) return false; - splitToValueTypes(Arg, ArgInfos, DL, MRI); + splitToValueTypes(Arg, ArgInfos, MF, [&](unsigned Reg, uint64_t Offset) { + MIRBuilder.buildExtract(Reg, Arg.Reg, Offset); + }); } auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false); @@ -423,12 +492,24 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, return false; ArgInfos.clear(); - splitToValueTypes(OrigRet, ArgInfos, DL, MRI); + SmallVector RegOffsets; + SmallVector SplitRegs; + splitToValueTypes(OrigRet, ArgInfos, MF, + [&](unsigned Reg, uint64_t Offset) { + RegOffsets.push_back(Offset); + SplitRegs.push_back(Reg); + }); auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false); CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler)) return false; + + if (!RegOffsets.empty()) { + // We have split the value and allocated each individual piece, now build + // it up again. + RetHandler.mergeRegisters(OrigRet.Reg, SplitRegs, RegOffsets); + } } // We now know the size of the stack - update the ADJCALLSTACKDOWN diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h index 6404c7a2689e..f5a6872336f6 100644 --- a/lib/Target/ARM/ARMCallLowering.h +++ b/lib/Target/ARM/ARMCallLowering.h @@ -42,11 +42,14 @@ private: bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg, MachineInstrBuilder &Ret) const; + typedef std::function SplitArgTy; + /// Split an argument into one or more arguments that the CC lowering can cope /// with (e.g. replace pointers with integers). 
void splitToValueTypes(const ArgInfo &OrigArg, SmallVectorImpl &SplitArgs, - const DataLayout &DL, MachineRegisterInfo &MRI) const; + MachineFunction &MF, + const SplitArgTy &PerformArgSplit) const; }; } // End of namespace llvm #endif diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 78a9144bd321..90baabcdb652 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -779,7 +779,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP(MachineBasicBlock &MBB, MachineOperand &Desired = MI.getOperand(3); MachineOperand &New = MI.getOperand(4); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); @@ -903,7 +903,7 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB, unsigned DesiredLo = TRI->getSubReg(Desired.getReg(), ARM::gsub_0); unsigned DesiredHi = TRI->getSubReg(Desired.getReg(), ARM::gsub_1); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index f8b584db7b99..62e774d869da 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -127,7 +127,7 @@ static cl::opt EnableConstpoolPromotion( "arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), - cl::init(true)); + cl::init(false)); // FIXME: set to true by default once PR32780 is fixed static cl::opt ConstpoolPromotionMaxSize( "arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), @@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, } } - // Lowering to i32/i16 if the size permits. - if (Size >= 4) - return MVT::i32; - else if (Size >= 2) - return MVT::i16; - // Let the target-independent logic figure it out. return MVT::Other; } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 875c06210ae6..26da528c19e6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -510,7 +510,7 @@ class InstrItineraryData; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; - bool canMergeStoresTo(EVT MemVT) const override { + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const override { // Do not merge to larger than i32. 
return (MemVT.getSizeInBits() <= 32); } diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 51290e5a5b93..858136a82078 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -674,7 +674,7 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in { class VLD1D op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd), (ins AddrMode:$Rn), IIC_VLD1, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -682,7 +682,7 @@ class VLD1D op7_4, string Dt, Operand AddrMode> class VLD1Q op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd), (ins AddrMode:$Rn), IIC_VLD1x2, - "vld1", Dt, "$Vd, $Rn", "", []> { + "vld1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -703,7 +703,7 @@ multiclass VLD1DWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -711,7 +711,7 @@ multiclass VLD1DWB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -720,7 +720,7 @@ multiclass VLD1QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -728,7 +728,7 @@ multiclass VLD1QWB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -747,7 +747,7 @@ defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>; class VLD1D3 op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd), (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt, - "$Vd, $Rn", "", []> { + "$Vd, $Rn", "", []>, Sched<[WriteVLD3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -756,7 +756,7 @@ multiclass VLD1D3WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -764,7 +764,7 @@ multiclass VLD1D3WB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -780,15 +780,15 @@ defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>; -def VLD1d64TPseudo : VLDQQPseudo; -def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo; +def VLD1d64TPseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD3]>; +def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD3]>; // ...with 4 registers class VLD1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd), (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt, - "$Vd, $Rn", "", []> { + "$Vd, $Rn", "", []>, Sched<[WriteVLD4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -797,7 +797,7 @@ multiclass VLD1D4WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -805,7 +805,7 @@ multiclass VLD1D4WB op7_4, string Dt, Operand AddrMode> { def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u, "vld1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -821,9 +821,9 @@ defm VLD1d16Qwb : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VLD1d32Qwb : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VLD1d64Qwb : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; -def VLD1d64QPseudo : VLDQQPseudo; -def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo; +def VLD1d64QPseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; // VLD2 : Vector Load (multiple 2-element structures) class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -837,22 +837,22 @@ class VLD2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, } def VLD2d8 : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2d16 : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2d32 : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2q8 : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; def VLD2q16 : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2, - 
addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; def VLD2q32 : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; -def VLD2q8Pseudo : VLDQQPseudo; -def VLD2q16Pseudo : VLDQQPseudo; -def VLD2q32Pseudo : VLDQQPseudo; +def VLD2q8Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD2q16Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD2q32Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; // ...with address register writeback: multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, @@ -875,45 +875,45 @@ multiclass VLD2WB op11_8, bits<4> op7_4, string Dt, } defm VLD2d8wb : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2q8wb : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVLD4]>; -def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo; -def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo; -def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo; -def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo; +def VLD2q8PseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD2q16PseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD2q32PseudoWB_fixed : VLDQQWBfixedPseudo, Sched<[WriteVLD4]>; +def VLD2q8PseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; +def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; +def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo, Sched<[WriteVLD4]>; // ...with double-spaced registers def VLD2b8 : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2b16 : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; def VLD2b32 : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b8wb : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVLD2]>; // VLD3 : Vector Load (multiple 3-element structures) class VLD3D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), (ins addrmode6:$Rn), IIC_VLD3, - "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []> { + "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn", "", []>, 
Sched<[WriteVLD3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; @@ -923,9 +923,9 @@ def VLD3d8 : VLD3D<0b0100, {0,0,0,?}, "8">; def VLD3d16 : VLD3D<0b0100, {0,1,0,?}, "16">; def VLD3d32 : VLD3D<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo : VLDQQPseudo; -def VLD3d16Pseudo : VLDQQPseudo; -def VLD3d32Pseudo : VLDQQPseudo; +def VLD3d8Pseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD3d16Pseudo : VLDQQPseudo, Sched<[WriteVLD3]>; +def VLD3d32Pseudo : VLDQQPseudo, Sched<[WriteVLD3]>; // ...with address register writeback: class VLD3DWB op11_8, bits<4> op7_4, string Dt> @@ -933,7 +933,7 @@ class VLD3DWB op11_8, bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD3u, "vld3", Dt, "\\{$Vd, $dst2, $dst3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; } @@ -942,9 +942,9 @@ def VLD3d8_UPD : VLD3DWB<0b0100, {0,0,0,?}, "8">; def VLD3d16_UPD : VLD3DWB<0b0100, {0,1,0,?}, "16">; def VLD3d32_UPD : VLD3DWB<0b0100, {1,0,0,?}, "32">; -def VLD3d8Pseudo_UPD : VLDQQWBPseudo; -def VLD3d16Pseudo_UPD : VLDQQWBPseudo; -def VLD3d32Pseudo_UPD : VLDQQWBPseudo; +def VLD3d8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3d16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3d32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD3]>; // ...with double-spaced registers: def VLD3q8 : VLD3D<0b0101, {0,0,0,?}, "8">; @@ -954,25 +954,26 @@ def VLD3q8_UPD : VLD3DWB<0b0101, {0,0,0,?}, "8">; def VLD3q16_UPD : VLD3DWB<0b0101, {0,1,0,?}, "16">; def VLD3q32_UPD : VLD3DWB<0b0101, {1,0,0,?}, "32">; -def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q16Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q32Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; // ...alternate versions to be allocated odd register numbers: -def VLD3q8oddPseudo : VLDQQQQPseudo; -def VLD3q16oddPseudo : VLDQQQQPseudo; -def VLD3q32oddPseudo : VLDQQQQPseudo; +def VLD3q8oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD3q16oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; +def VLD3q32oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD3]>; -def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD3q8oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q16oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; +def VLD3q32oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD3]>; // VLD4 : Vector Load (multiple 4-element structures) class VLD4D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4), (ins addrmode6:$Rn), IIC_VLD4, - "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []> { + "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn", "", []>, + Sched<[WriteVLD4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; @@ -982,9 +983,9 @@ def VLD4d8 : VLD4D<0b0000, {0,0,?,?}, "8">; def VLD4d16 : VLD4D<0b0000, {0,1,?,?}, "16">; def VLD4d32 : VLD4D<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo : VLDQQPseudo; -def VLD4d16Pseudo : VLDQQPseudo; -def VLD4d32Pseudo : VLDQQPseudo; +def VLD4d8Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD4d16Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; +def VLD4d32Pseudo : VLDQQPseudo, Sched<[WriteVLD4]>; // 
...with address register writeback: class VLD4DWB op11_8, bits<4> op7_4, string Dt> @@ -992,7 +993,7 @@ class VLD4DWB op11_8, bits<4> op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6:$Rn, am6offset:$Rm), IIC_VLD4u, "vld4", Dt, "\\{$Vd, $dst2, $dst3, $dst4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; } @@ -1001,9 +1002,9 @@ def VLD4d8_UPD : VLD4DWB<0b0000, {0,0,?,?}, "8">; def VLD4d16_UPD : VLD4DWB<0b0000, {0,1,?,?}, "16">; def VLD4d32_UPD : VLD4DWB<0b0000, {1,0,?,?}, "32">; -def VLD4d8Pseudo_UPD : VLDQQWBPseudo; -def VLD4d16Pseudo_UPD : VLDQQWBPseudo; -def VLD4d32Pseudo_UPD : VLDQQWBPseudo; +def VLD4d8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4d16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4d32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD4]>; // ...with double-spaced registers: def VLD4q8 : VLD4D<0b0001, {0,0,?,?}, "8">; @@ -1013,18 +1014,18 @@ def VLD4q8_UPD : VLD4DWB<0b0001, {0,0,?,?}, "8">; def VLD4q16_UPD : VLD4DWB<0b0001, {0,1,?,?}, "16">; def VLD4q32_UPD : VLD4DWB<0b0001, {1,0,?,?}, "32">; -def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo; -def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo; -def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo; +def VLD4q8Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q16Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q32Pseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; // ...alternate versions to be allocated odd register numbers: -def VLD4q8oddPseudo : VLDQQQQPseudo; -def VLD4q16oddPseudo : VLDQQQQPseudo; -def VLD4q32oddPseudo : VLDQQQQPseudo; +def VLD4q8oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD4q16oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; +def VLD4q32oddPseudo : VLDQQQQPseudo, Sched<[WriteVLD4]>; -def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo; -def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo; +def VLD4q8oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q16oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; +def VLD4q32oddPseudo_UPD : VLDQQQQWBPseudo, Sched<[WriteVLD4]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1076,11 +1077,12 @@ class VLD1LN32 op11_8, bits<4> op7_4, string Dt, ValueType Ty, "$src = $Vd", [(set DPR:$Vd, (vector_insert (Ty DPR:$src), (i32 (LoadOp addrmode6oneL32:$Rn)), - imm:$lane))]> { + imm:$lane))]>, Sched<[WriteVLD1]> { let Rm = 0b1111; let DecoderMethod = "DecodeVLD1LN"; } -class VLD1QLNPseudo : VLDQLNPseudo { +class VLD1QLNPseudo : VLDQLNPseudo, + Sched<[WriteVLD1]> { let Pattern = [(set QPR:$dst, (vector_insert (Ty QPR:$src), (i32 (LoadOp addrmode6:$addr)), imm:$lane))]; @@ -1117,7 +1119,7 @@ class VLD1LNWB op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$src, nohash_imm:$lane), IIC_VLD1lnu, "vld1", Dt, "\\{$Vd[$lane]\\}, $Rn$Rm", - "$src = $Vd, $Rn.addr = $wb", []> { + "$src = $Vd, $Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let DecoderMethod = "DecodeVLD1LN"; } @@ -1134,16 +1136,16 @@ def VLD1LNd32_UPD : VLD1LNWB<0b1000, {?,0,?,?}, "32"> { let Inst{4} = Rn{4}; } -def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo; -def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo; -def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo; +def VLD1LNq8Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD1LNq16Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD1LNq32Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; // VLD2LN : Vector Load (single 2-element structure to one lane) 
class VLD2LN op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b10, op11_8, op7_4, (outs DPR:$Vd, DPR:$dst2), (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, nohash_imm:$lane), IIC_VLD2ln, "vld2", Dt, "\\{$Vd[$lane], $dst2[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2", []> { + "$src1 = $Vd, $src2 = $dst2", []>, Sched<[WriteVLD1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2LN"; @@ -1159,9 +1161,9 @@ def VLD2LNd32 : VLD2LN<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNd8Pseudo : VLDQLNPseudo; -def VLD2LNd16Pseudo : VLDQLNPseudo; -def VLD2LNd32Pseudo : VLDQLNPseudo; +def VLD2LNd8Pseudo : VLDQLNPseudo, Sched<[WriteVLD1]>; +def VLD2LNd16Pseudo : VLDQLNPseudo, Sched<[WriteVLD1]>; +def VLD2LNd32Pseudo : VLDQLNPseudo, Sched<[WriteVLD1]>; // ...with double-spaced registers: def VLD2LNq16 : VLD2LN<0b0101, {?,?,1,?}, "16"> { @@ -1171,8 +1173,8 @@ def VLD2LNq32 : VLD2LN<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNq16Pseudo : VLDQQLNPseudo; -def VLD2LNq32Pseudo : VLDQQLNPseudo; +def VLD2LNq16Pseudo : VLDQQLNPseudo, Sched<[WriteVLD1]>; +def VLD2LNq32Pseudo : VLDQQLNPseudo, Sched<[WriteVLD1]>; // ...with address register writeback: class VLD2LNWB op11_8, bits<4> op7_4, string Dt> @@ -1195,9 +1197,9 @@ def VLD2LNd32_UPD : VLD2LNWB<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo; -def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo; -def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo; +def VLD2LNd8Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD2LNd16Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD2LNd32Pseudo_UPD : VLDQLNWBPseudo, Sched<[WriteVLD1]>; def VLD2LNq16_UPD : VLD2LNWB<0b0101, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1206,8 +1208,8 @@ def VLD2LNq32_UPD : VLD2LNWB<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD2LNq16Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD1]>; +def VLD2LNq32Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD1]>; // VLD3LN : Vector Load (single 3-element structure to one lane) class VLD3LN op11_8, bits<4> op7_4, string Dt> @@ -1215,7 +1217,7 @@ class VLD3LN op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VLD3ln, "vld3", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []> { + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3", []>, Sched<[WriteVLD2]> { let Rm = 0b1111; let DecoderMethod = "DecodeVLD3LN"; } @@ -1230,9 +1232,9 @@ def VLD3LNd32 : VLD3LN<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNd8Pseudo : VLDQQLNPseudo; -def VLD3LNd16Pseudo : VLDQQLNPseudo; -def VLD3LNd32Pseudo : VLDQQLNPseudo; +def VLD3LNd8Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD3LNd16Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD3LNd32Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers: def VLD3LNq16 : VLD3LN<0b0110, {?,?,1,0}, "16"> { @@ -1242,8 +1244,8 @@ def VLD3LNq32 : VLD3LN<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNq16Pseudo : VLDQQQQLNPseudo; -def VLD3LNq32Pseudo : VLDQQQQLNPseudo; +def VLD3LNq16Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; +def VLD3LNq32Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; // ...with address register writeback: class VLD3LNWB op11_8, bits<4> op7_4, string Dt> @@ -1254,7 +1256,7 @@ class VLD3LNWB op11_8, bits<4> op7_4, string Dt> IIC_VLD3lnu, "vld3", Dt, "\\{$Vd[$lane], $dst2[$lane], 
$dst3[$lane]\\}, $Rn$Rm", "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $Rn.addr = $wb", - []> { + []>, Sched<[WriteVLD2]> { let DecoderMethod = "DecodeVLD3LN"; } @@ -1268,9 +1270,9 @@ def VLD3LNd32_UPD : VLD3LNWB<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo; -def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD3LNd8Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD3LNd16Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD3LNd32Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; def VLD3LNq16_UPD : VLD3LNWB<0b0110, {?,?,1,0}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1279,8 +1281,8 @@ def VLD3LNq32_UPD : VLD3LNWB<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; -def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD3LNq16Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD3LNq32Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; // VLD4LN : Vector Load (single 4-element structure to one lane) class VLD4LN op11_8, bits<4> op7_4, string Dt> @@ -1289,7 +1291,8 @@ class VLD4LN op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$src1, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VLD4ln, "vld4", Dt, "\\{$Vd[$lane], $dst2[$lane], $dst3[$lane], $dst4[$lane]\\}, $Rn", - "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []> { + "$src1 = $Vd, $src2 = $dst2, $src3 = $dst3, $src4 = $dst4", []>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD4LN"; @@ -1306,9 +1309,9 @@ def VLD4LNd32 : VLD4LN<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNd8Pseudo : VLDQQLNPseudo; -def VLD4LNd16Pseudo : VLDQQLNPseudo; -def VLD4LNd32Pseudo : VLDQQLNPseudo; +def VLD4LNd8Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD4LNd16Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; +def VLD4LNd32Pseudo : VLDQQLNPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers: def VLD4LNq16 : VLD4LN<0b0111, {?,?,1,?}, "16"> { @@ -1319,8 +1322,8 @@ def VLD4LNq32 : VLD4LN<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNq16Pseudo : VLDQQQQLNPseudo; -def VLD4LNq32Pseudo : VLDQQQQLNPseudo; +def VLD4LNq16Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; +def VLD4LNq32Pseudo : VLDQQQQLNPseudo, Sched<[WriteVLD2]>; // ...with address register writeback: class VLD4LNWB op11_8, bits<4> op7_4, string Dt> @@ -1347,9 +1350,9 @@ def VLD4LNd32_UPD : VLD4LNWB<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo; -def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo; -def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo; +def VLD4LNd8Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD4LNd16Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD4LNd32Pseudo_UPD : VLDQQLNWBPseudo, Sched<[WriteVLD2]>; def VLD4LNq16_UPD : VLD4LNWB<0b0111, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -1359,8 +1362,8 @@ def VLD4LNq32_UPD : VLD4LNWB<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo; -def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo; +def VLD4LNq16Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; +def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo, Sched<[WriteVLD2]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1371,7 +1374,8 @@ class VLD1DUP op7_4, string Dt, ValueType Ty, PatFrag LoadOp, (ins AddrMode:$Rn), IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "", [(set VecListOneDAllLanes:$Vd, - (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> 
{ + (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1434,7 +1438,7 @@ multiclass VLD1QDUPWB op7_4, string Dt, Operand AddrMode> { (outs VecListDPairAllLanes:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD1dupu, "vld1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD1DupInstruction"; @@ -1491,7 +1495,7 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy, (outs VdTy:$Vd, GPR:$wb), (ins AddrMode:$Rn), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; @@ -1500,7 +1504,7 @@ multiclass VLD2DUPWB op7_4, string Dt, RegisterOperand VdTy, (outs VdTy:$Vd, GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu, "vld2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD2DupInstruction"; } @@ -1524,7 +1528,8 @@ defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes, class VLD3DUP op7_4, string Dt> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3), (ins addrmode6dup:$Rn), IIC_VLD3dup, - "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []> { + "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn", "", []>, + Sched<[WriteVLD2]> { let Rm = 0b1111; let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; @@ -1534,9 +1539,9 @@ def VLD3DUPd8 : VLD3DUP<{0,0,0,?}, "8">; def VLD3DUPd16 : VLD3DUP<{0,1,0,?}, "16">; def VLD3DUPd32 : VLD3DUP<{1,0,0,?}, "32">; -def VLD3DUPd8Pseudo : VLDQQPseudo; -def VLD3DUPd16Pseudo : VLDQQPseudo; -def VLD3DUPd32Pseudo : VLDQQPseudo; +def VLD3DUPd8Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd16Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd32Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers (not used for codegen): def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">; @@ -1548,7 +1553,7 @@ class VLD3DUPWB op7_4, string Dt, Operand AddrMode> : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb), (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu, "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{4} = 0; let DecoderMethod = "DecodeVLD3DupInstruction"; } @@ -1561,9 +1566,9 @@ def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8", addrmode6dupalign64>; def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>; def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>; -def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo; -def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo; -def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo; +def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD3DUPd32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; // VLD4DUP : Vector Load (single 4-element structure to all lanes) class VLD4DUP op7_4, string Dt> @@ -1580,9 +1585,9 @@ def VLD4DUPd8 : VLD4DUP<{0,0,0,?}, "8">; def VLD4DUPd16 : VLD4DUP<{0,1,0,?}, "16">; def VLD4DUPd32 : VLD4DUP<{1,?,0,?}, "32"> { let Inst{6} = Rn{5}; } -def VLD4DUPd8Pseudo : VLDQQPseudo; -def VLD4DUPd16Pseudo : VLDQQPseudo; -def VLD4DUPd32Pseudo : 
VLDQQPseudo; +def VLD4DUPd8Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd16Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd32Pseudo : VLDQQPseudo, Sched<[WriteVLD2]>; // ...with double-spaced registers (not used for codegen): def VLD4DUPq8 : VLD4DUP<{0,0,1,?}, "8">; @@ -1595,7 +1600,7 @@ class VLD4DUPWB op7_4, string Dt> (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, DPR:$dst4, GPR:$wb), (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD4dupu, "vld4", Dt, "\\{$Vd[], $dst2[], $dst3[], $dst4[]\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVLD2]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLD4DupInstruction"; } @@ -1608,9 +1613,9 @@ def VLD4DUPq8_UPD : VLD4DUPWB<{0,0,1,0}, "8">; def VLD4DUPq16_UPD : VLD4DUPWB<{0,1,1,?}, "16">; def VLD4DUPq32_UPD : VLD4DUPWB<{1,?,1,?}, "32"> { let Inst{6} = Rn{5}; } -def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo; -def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo; -def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo; +def VLD4DUPd8Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd16Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; +def VLD4DUPd32Pseudo_UPD : VLDQQWBPseudo, Sched<[WriteVLD2]>; } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 @@ -1657,14 +1662,14 @@ class VSTQQQQWBPseudo // VST1 : Vector Store (multiple single elements) class VST1D op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd), - IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } class VST1Q op7_4, string Dt, Operand AddrMode> : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd), - IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST2]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1685,7 +1690,7 @@ multiclass VST1DWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST1]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1694,7 +1699,7 @@ multiclass VST1DWB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd), IIC_VLD1u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST1]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1703,7 +1708,7 @@ multiclass VST1QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1712,7 +1717,7 @@ multiclass VST1QWB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd), IIC_VLD1x2u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1732,7 +1737,7 @@ defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>; class VST1D3 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0110, op7_4, (outs), (ins AddrMode:$Rn, VecListThreeD:$Vd), - IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> { + IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []>, Sched<[WriteVST3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1741,7 +1746,7 @@ multiclass VST1D3WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1750,7 +1755,7 @@ multiclass VST1D3WB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd), IIC_VLD1x3u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1766,16 +1771,16 @@ defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>; defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>; defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>; -def VST1d64TPseudo : VSTQQPseudo; -def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo; -def VST1d64TPseudoWB_register : VSTQQWBPseudo; +def VST1d64TPseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST1d64TPseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST3]>; +def VST1d64TPseudoWB_register : VSTQQWBPseudo, Sched<[WriteVST3]>; // ...with 4 registers class VST1D4 op7_4, string Dt, Operand AddrMode> : NLdSt<0, 0b00, 0b0010, op7_4, (outs), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "", - []> { + []>, Sched<[WriteVST4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1784,7 +1789,7 @@ multiclass VST1D4WB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; @@ -1793,7 +1798,7 @@ multiclass VST1D4WB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1x4u, "vst1", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST1Instruction"; } @@ -1809,9 +1814,9 @@ defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>; defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>; -def VST1d64QPseudo : VSTQQPseudo; -def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo; -def VST1d64QPseudoWB_register : VSTQQWBPseudo; +def VST1d64QPseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST1d64QPseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST1d64QPseudoWB_register : VSTQQWBPseudo, Sched<[WriteVST4]>; // VST2 : Vector Store (multiple 2-element structures) class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, @@ -1824,22 +1829,22 @@ class VST2 op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy, } def VST2d8 : VST2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2d16 : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2d32 : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2, - addrmode6align64or128>; + addrmode6align64or128>, Sched<[WriteVST2]>; def VST2q8 : VST2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; def VST2q16 : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; def VST2q32 : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2, - addrmode6align64or128or256>; + addrmode6align64or128or256>, Sched<[WriteVST4]>; -def VST2q8Pseudo : VSTQQPseudo; -def VST2q16Pseudo : VSTQQPseudo; -def VST2q32Pseudo : VSTQQPseudo; +def VST2q8Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST2q16Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST2q32Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; // ...with address register writeback: multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, @@ -1847,7 +1852,7 @@ multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; @@ -1855,7 +1860,7 @@ multiclass VST2DWB op11_8, bits<4> op7_4, string Dt, def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST2]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } @@ -1864,7 +1869,7 @@ multiclass VST2QWB op7_4, string Dt, Operand AddrMode> { def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb), (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn!", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Rm = 0b1101; // NLdSt will assign to the right encoding bits. 
let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; @@ -1873,7 +1878,7 @@ multiclass VST2QWB op7_4, string Dt, Operand AddrMode> { (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd), IIC_VLD1u, "vst2", Dt, "$Vd, $Rn, $Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST2Instruction"; } @@ -1890,12 +1895,12 @@ defm VST2q8wb : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>; defm VST2q16wb : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>; defm VST2q32wb : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>; -def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo; -def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo; -def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo; -def VST2q8PseudoWB_register : VSTQQWBregisterPseudo; -def VST2q16PseudoWB_register : VSTQQWBregisterPseudo; -def VST2q32PseudoWB_register : VSTQQWBregisterPseudo; +def VST2q8PseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST2q16PseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST2q32PseudoWB_fixed : VSTQQWBfixedPseudo, Sched<[WriteVST4]>; +def VST2q8PseudoWB_register : VSTQQWBregisterPseudo, Sched<[WriteVST4]>; +def VST2q16PseudoWB_register : VSTQQWBregisterPseudo, Sched<[WriteVST4]>; +def VST2q32PseudoWB_register : VSTQQWBregisterPseudo, Sched<[WriteVST4]>; // ...with double-spaced registers def VST2b8 : VST2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VST2, @@ -1915,7 +1920,7 @@ defm VST2b32wb : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, class VST3D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3, - "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []> { + "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn", "", []>, Sched<[WriteVST3]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; @@ -1925,9 +1930,9 @@ def VST3d8 : VST3D<0b0100, {0,0,0,?}, "8">; def VST3d16 : VST3D<0b0100, {0,1,0,?}, "16">; def VST3d32 : VST3D<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo : VSTQQPseudo; -def VST3d16Pseudo : VSTQQPseudo; -def VST3d32Pseudo : VSTQQPseudo; +def VST3d8Pseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST3d16Pseudo : VSTQQPseudo, Sched<[WriteVST3]>; +def VST3d32Pseudo : VSTQQPseudo, Sched<[WriteVST3]>; // ...with address register writeback: class VST3DWB op11_8, bits<4> op7_4, string Dt> @@ -1935,7 +1940,7 @@ class VST3DWB op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2, DPR:$src3), IIC_VST3u, "vst3", Dt, "\\{$Vd, $src2, $src3\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST3]> { let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVLDST3Instruction"; } @@ -1944,9 +1949,9 @@ def VST3d8_UPD : VST3DWB<0b0100, {0,0,0,?}, "8">; def VST3d16_UPD : VST3DWB<0b0100, {0,1,0,?}, "16">; def VST3d32_UPD : VST3DWB<0b0100, {1,0,0,?}, "32">; -def VST3d8Pseudo_UPD : VSTQQWBPseudo; -def VST3d16Pseudo_UPD : VSTQQWBPseudo; -def VST3d32Pseudo_UPD : VSTQQWBPseudo; +def VST3d8Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST3]>; +def VST3d16Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST3]>; +def VST3d32Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST3]>; // ...with double-spaced registers: def VST3q8 : VST3D<0b0101, {0,0,0,?}, "8">; @@ -1956,25 +1961,25 @@ def VST3q8_UPD : VST3DWB<0b0101, {0,0,0,?}, "8">; def VST3q16_UPD : VST3DWB<0b0101, {0,1,0,?}, "16">; def VST3q32_UPD : VST3DWB<0b0101, {1,0,0,?}, "32">; -def VST3q8Pseudo_UPD : VSTQQQQWBPseudo; -def 
VST3q16Pseudo_UPD : VSTQQQQWBPseudo; -def VST3q32Pseudo_UPD : VSTQQQQWBPseudo; +def VST3q8Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q16Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q32Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; // ...alternate versions to be allocated odd register numbers: -def VST3q8oddPseudo : VSTQQQQPseudo; -def VST3q16oddPseudo : VSTQQQQPseudo; -def VST3q32oddPseudo : VSTQQQQPseudo; +def VST3q8oddPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST3q16oddPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; +def VST3q32oddPseudo : VSTQQQQPseudo, Sched<[WriteVST3]>; -def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo; -def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo; -def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo; +def VST3q8oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q16oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; +def VST3q32oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST3]>; // VST4 : Vector Store (multiple 4-element structures) class VST4D op11_8, bits<4> op7_4, string Dt> : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST4]> { let Rm = 0b1111; let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; @@ -1984,9 +1989,9 @@ def VST4d8 : VST4D<0b0000, {0,0,?,?}, "8">; def VST4d16 : VST4D<0b0000, {0,1,?,?}, "16">; def VST4d32 : VST4D<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo : VSTQQPseudo; -def VST4d16Pseudo : VSTQQPseudo; -def VST4d32Pseudo : VSTQQPseudo; +def VST4d8Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST4d16Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; +def VST4d32Pseudo : VSTQQPseudo, Sched<[WriteVST4]>; // ...with address register writeback: class VST4DWB op11_8, bits<4> op7_4, string Dt> @@ -1994,7 +1999,7 @@ class VST4DWB op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, am6offset:$Rm, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4), IIC_VST4u, "vst4", Dt, "\\{$Vd, $src2, $src3, $src4\\}, $Rn$Rm", - "$Rn.addr = $wb", []> { + "$Rn.addr = $wb", []>, Sched<[WriteVST4]> { let Inst{5-4} = Rn{5-4}; let DecoderMethod = "DecodeVLDST4Instruction"; } @@ -2003,9 +2008,9 @@ def VST4d8_UPD : VST4DWB<0b0000, {0,0,?,?}, "8">; def VST4d16_UPD : VST4DWB<0b0000, {0,1,?,?}, "16">; def VST4d32_UPD : VST4DWB<0b0000, {1,0,?,?}, "32">; -def VST4d8Pseudo_UPD : VSTQQWBPseudo; -def VST4d16Pseudo_UPD : VSTQQWBPseudo; -def VST4d32Pseudo_UPD : VSTQQWBPseudo; +def VST4d8Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST4]>; +def VST4d16Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST4]>; +def VST4d32Pseudo_UPD : VSTQQWBPseudo, Sched<[WriteVST4]>; // ...with double-spaced registers: def VST4q8 : VST4D<0b0001, {0,0,?,?}, "8">; @@ -2015,18 +2020,18 @@ def VST4q8_UPD : VST4DWB<0b0001, {0,0,?,?}, "8">; def VST4q16_UPD : VST4DWB<0b0001, {0,1,?,?}, "16">; def VST4q32_UPD : VST4DWB<0b0001, {1,0,?,?}, "32">; -def VST4q8Pseudo_UPD : VSTQQQQWBPseudo; -def VST4q16Pseudo_UPD : VSTQQQQWBPseudo; -def VST4q32Pseudo_UPD : VSTQQQQWBPseudo; +def VST4q8Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q16Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q32Pseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; // ...alternate versions to be allocated odd register numbers: -def VST4q8oddPseudo : VSTQQQQPseudo; -def VST4q16oddPseudo : VSTQQQQPseudo; -def VST4q32oddPseudo : VSTQQQQPseudo; +def VST4q8oddPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def VST4q16oddPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; +def 
VST4q32oddPseudo : VSTQQQQPseudo, Sched<[WriteVST4]>; -def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo; -def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo; -def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo; +def VST4q8oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q16oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; +def VST4q32oddPseudo_UPD : VSTQQQQWBPseudo, Sched<[WriteVST4]>; } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 @@ -2060,12 +2065,13 @@ class VST1LN op11_8, bits<4> op7_4, string Dt, ValueType Ty, : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, DPR:$Vd, nohash_imm:$lane), IIC_VST1ln, "vst1", Dt, "\\{$Vd[$lane]\\}, $Rn", "", - [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]> { + [(StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), AddrMode:$Rn)]>, + Sched<[WriteVST1]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST1LN"; } class VST1QLNPseudo - : VSTQLNPseudo { + : VSTQLNPseudo, Sched<[WriteVST1]> { let Pattern = [(StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr)]; } @@ -2104,11 +2110,12 @@ class VST1LNWB op11_8, bits<4> op7_4, string Dt, ValueType Ty, "\\{$Vd[$lane]\\}, $Rn$Rm", "$Rn.addr = $wb", [(set GPR:$wb, (StoreOp (ExtractOp (Ty DPR:$Vd), imm:$lane), - AdrMode:$Rn, am6offset:$Rm))]> { + AdrMode:$Rn, am6offset:$Rm))]>, + Sched<[WriteVST1]> { let DecoderMethod = "DecodeVST1LN"; } class VST1QLNWBPseudo - : VSTQLNWBPseudo { + : VSTQLNWBPseudo, Sched<[WriteVST1]> { let Pattern = [(set GPR:$wb, (StoreOp (ExtractOp (Ty QPR:$src), imm:$lane), addrmode6:$addr, am6offset:$offset))]; } @@ -2139,7 +2146,7 @@ class VST2LN op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, nohash_imm:$lane), IIC_VST2ln, "vst2", Dt, "\\{$Vd[$lane], $src2[$lane]\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST1]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST2LN"; @@ -2155,9 +2162,9 @@ def VST2LNd32 : VST2LN<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNd8Pseudo : VSTQLNPseudo; -def VST2LNd16Pseudo : VSTQLNPseudo; -def VST2LNd32Pseudo : VSTQLNPseudo; +def VST2LNd8Pseudo : VSTQLNPseudo, Sched<[WriteVST1]>; +def VST2LNd16Pseudo : VSTQLNPseudo, Sched<[WriteVST1]>; +def VST2LNd32Pseudo : VSTQLNPseudo, Sched<[WriteVST1]>; // ...with double-spaced registers: def VST2LNq16 : VST2LN<0b0101, {?,?,1,?}, "16"> { @@ -2169,8 +2176,8 @@ def VST2LNq32 : VST2LN<0b1001, {?,1,0,?}, "32"> { let Inst{4} = Rn{4}; } -def VST2LNq16Pseudo : VSTQQLNPseudo; -def VST2LNq32Pseudo : VSTQQLNPseudo; +def VST2LNq16Pseudo : VSTQQLNPseudo, Sched<[WriteVST1]>; +def VST2LNq32Pseudo : VSTQQLNPseudo, Sched<[WriteVST1]>; // ...with address register writeback: class VST2LNWB op11_8, bits<4> op7_4, string Dt> @@ -2193,9 +2200,9 @@ def VST2LNd32_UPD : VST2LNWB<0b1001, {?,0,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo; -def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo; -def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo; +def VST2LNd8Pseudo_UPD : VSTQLNWBPseudo, Sched<[WriteVST1]>; +def VST2LNd16Pseudo_UPD : VSTQLNWBPseudo, Sched<[WriteVST1]>; +def VST2LNd32Pseudo_UPD : VSTQLNWBPseudo, Sched<[WriteVST1]>; def VST2LNq16_UPD : VST2LNWB<0b0101, {?,?,1,?}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2204,15 +2211,16 @@ def VST2LNq32_UPD : VST2LNWB<0b1001, {?,1,0,?}, "32"> { let Inst{7} = lane{0}; } -def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo; -def VST2LNq32Pseudo_UPD : VSTQQLNWBPseudo; +def VST2LNq16Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST1]>; +def VST2LNq32Pseudo_UPD : 
VSTQQLNWBPseudo, Sched<[WriteVST1]>; // VST3LN : Vector Store (single 3-element structure from one lane) class VST3LN op11_8, bits<4> op7_4, string Dt> : NLdStLn<1, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, nohash_imm:$lane), IIC_VST3ln, "vst3", Dt, - "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []> { + "\\{$Vd[$lane], $src2[$lane], $src3[$lane]\\}, $Rn", "", []>, + Sched<[WriteVST2]> { let Rm = 0b1111; let DecoderMethod = "DecodeVST3LN"; } @@ -2227,9 +2235,9 @@ def VST3LNd32 : VST3LN<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNd8Pseudo : VSTQQLNPseudo; -def VST3LNd16Pseudo : VSTQQLNPseudo; -def VST3LNd32Pseudo : VSTQQLNPseudo; +def VST3LNd8Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST3LNd16Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST3LNd32Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; // ...with double-spaced registers: def VST3LNq16 : VST3LN<0b0110, {?,?,1,0}, "16"> { @@ -2263,9 +2271,9 @@ def VST3LNd32_UPD : VST3LNWB<0b1010, {?,0,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo; -def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo; -def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo; +def VST3LNd8Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST3LNd16Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST3LNd32Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; def VST3LNq16_UPD : VST3LNWB<0b0110, {?,?,1,0}, "16"> { let Inst{7-6} = lane{1-0}; @@ -2274,8 +2282,8 @@ def VST3LNq32_UPD : VST3LNWB<0b1010, {?,1,0,0}, "32"> { let Inst{7} = lane{0}; } -def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo; -def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo; +def VST3LNq16Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>; +def VST3LNq32Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>; // VST4LN : Vector Store (single 4-element structure from one lane) class VST4LN op11_8, bits<4> op7_4, string Dt> @@ -2283,7 +2291,7 @@ class VST4LN op11_8, bits<4> op7_4, string Dt> (ins addrmode6:$Rn, DPR:$Vd, DPR:$src2, DPR:$src3, DPR:$src4, nohash_imm:$lane), IIC_VST4ln, "vst4", Dt, "\\{$Vd[$lane], $src2[$lane], $src3[$lane], $src4[$lane]\\}, $Rn", - "", []> { + "", []>, Sched<[WriteVST2]> { let Rm = 0b1111; let Inst{4} = Rn{4}; let DecoderMethod = "DecodeVST4LN"; @@ -2300,9 +2308,9 @@ def VST4LNd32 : VST4LN<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNd8Pseudo : VSTQQLNPseudo; -def VST4LNd16Pseudo : VSTQQLNPseudo; -def VST4LNd32Pseudo : VSTQQLNPseudo; +def VST4LNd8Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST4LNd16Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; +def VST4LNd32Pseudo : VSTQQLNPseudo, Sched<[WriteVST2]>; // ...with double-spaced registers: def VST4LNq16 : VST4LN<0b0111, {?,?,1,?}, "16"> { @@ -2313,8 +2321,8 @@ def VST4LNq32 : VST4LN<0b1011, {?,1,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNq16Pseudo : VSTQQQQLNPseudo; -def VST4LNq32Pseudo : VSTQQQQLNPseudo; +def VST4LNq16Pseudo : VSTQQQQLNPseudo, Sched<[WriteVST2]>; +def VST4LNq32Pseudo : VSTQQQQLNPseudo, Sched<[WriteVST2]>; // ...with address register writeback: class VST4LNWB op11_8, bits<4> op7_4, string Dt> @@ -2339,9 +2347,9 @@ def VST4LNd32_UPD : VST4LNWB<0b1011, {?,0,?,?}, "32"> { let Inst{5} = Rn{5}; } -def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo; -def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo; -def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo; +def VST4LNd8Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST4LNd16Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; +def VST4LNd32Pseudo_UPD : VSTQQLNWBPseudo, Sched<[WriteVST2]>; def 
VST4LNq16_UPD : VST4LNWB<0b0111, {?,?,1,?}, "16"> {
  let Inst{7-6} = lane{1-0};
@@ -2351,8 +2359,8 @@ def VST4LNq32_UPD : VST4LNWB<0b1011, {?,1,?,?}, "32"> {
   let Inst{5} = Rn{5};
 }

-def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo;
-def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo;
+def VST4LNq16Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>;
+def VST4LNq32Pseudo_UPD : VSTQQQQLNWBPseudo, Sched<[WriteVST2]>;

 } // mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1

diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index 87eb4c2b9074..ec5b97cba8cd 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -131,6 +131,17 @@ def WriteFPDIV64 : SchedWrite;
 def WriteFPSQRT32 : SchedWrite;
 def WriteFPSQRT64 : SchedWrite;

+// Vector load and stores
+def WriteVLD1 : SchedWrite;
+def WriteVLD2 : SchedWrite;
+def WriteVLD3 : SchedWrite;
+def WriteVLD4 : SchedWrite;
+def WriteVST1 : SchedWrite;
+def WriteVST2 : SchedWrite;
+def WriteVST3 : SchedWrite;
+def WriteVST4 : SchedWrite;
+
+
 // Define TII for use in SchedVariant Predicates.
 def : PredicateProlog<[{
   const ARMBaseInstrInfo *TII =
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 8fb8a2a3b6d2..4e72b13d94cb 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1981,6 +1981,15 @@ def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
 def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
 def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }

+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
 // Reserve A9UnitFP for 2 consecutive cycles.
 def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
   let Latency = 4;
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index 537e5da9669f..782be9b60a7a 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -120,6 +120,12 @@ def : WriteRes {
 def : WriteRes { let Latency = 7; }
 def : WriteRes { let Latency = 17; }

+// Overriden via InstRW for this processor.
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+def : WriteRes;
+
 def : ReadAdvance; // mul operand read in F1
 def : ReadAdvance; // fp-mac operand read in F1
@@ -712,20 +718,20 @@ def R52WriteSTM : SchedWriteVariant<[
 // Vector Load/Stores. Can issue only in slot-0. Can dual-issue with
 // another instruction in slot-1, but only in the last issue.
-def R52WriteVLD1Mem : SchedWriteRes<[R52UnitLd]> { let Latency = 5;}
-def R52WriteVLD2Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes { let Latency = 5;}
+def : WriteRes {
   let Latency = 6;
   let NumMicroOps = 3;
   let ResourceCycles = [2];
   let SingleIssue = 1;
 }
-def R52WriteVLD3Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes {
   let Latency = 7;
   let NumMicroOps = 5;
   let ResourceCycles = [3];
   let SingleIssue = 1;
 }
-def R52WriteVLD4Mem : SchedWriteRes<[R52UnitLd]> {
+def : WriteRes {
   let Latency = 8;
   let NumMicroOps = 7;
   let ResourceCycles = [4];
@@ -828,95 +834,6 @@ def : InstRW<[R52WriteFPALU_F5, R52Read_F1, R52Read_F1],
 def : InstRW<[R52WriteFPALU_F4, R52Read_F1, R52Read_F1],
     (instregex "VRSHL", "VRSHR", "VRSHRN", "VTB")>;
 def : InstRW<[R52WriteFPALU_F3, R52Read_F1, R52Read_F1],
     (instregex "VSWP", "VTRN", "VUZP", "VZIP")>;

-//---
// VLDx.
Vector Loads -//--- -// 1-element structure load -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD1q(8|16|32|64)$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)T$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Q$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD1d64TPseudo$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD1d64QPseudo$")>; - -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)d(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1LNdAsm_(8|16|32)")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo$")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)wb")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1q(8|16|32|64)wb")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Twb")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d(8|16|32|64)Qwb")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64TPseudoWB")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1d64QPseudoWB")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNd(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1LNdWB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1DUP(d|q)(8|16|32)wb")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD1(LN|DUP)q(8|16|32)Pseudo_UPD")>; - -// 2-element structure load -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2(d|b)(8|16|32)wb")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)wb")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD2q(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2q(8|16|32)PseudoWB")>; - -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNdAsm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNqAsm_(16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2$")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo")>; -def : InstRW<[R52WriteVLD1Mem, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNdWB_(fixed|register)_Asm_(8|16|32)")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNqWB_(fixed|register)_Asm_(16|32)")>; - -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)wb")>; -def : InstRW<[R52WriteVLD1Mem, 
R52WriteAdr, R52Read_ISS], (instregex "VLD2DUPd(8|16|32)x2wb")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNd(8|16|32)Pseudo_UPD")>; -def : InstRW<[R52WriteVLD1Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD2LNq(16|32)Pseudo_UPD")>; - -// 3-element structure load -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD3Mem, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo")>; -def : InstRW<[R52WriteVLD3Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(d|q)(8|16|32)(oddP|P)seudo_UPD")>; - -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo$")>; - -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - -// 4-element structure load -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD4Mem, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD4Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(d|q)(8|16|32)(oddP|P)seudo_UPD")>; - - -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)Asm_(8|16|32)$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4LN(d|q)(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD2Mem, R52Read_ISS], (instregex "VLD4DUPd(8|16|32)Pseudo$")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)_UPD")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)WB_(fixed|register)_Asm_(8|16|32)")>; -def : InstRW<[R52WriteVLD2Mem, R52WriteAdr, R52Read_ISS], (instregex "VLD4(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>; - //--- // VSTx. Vector Stores //--- diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td index dc041c6c6006..b838688c6f04 100644 --- a/lib/Target/ARM/ARMScheduleSwift.td +++ b/lib/Target/ARM/ARMScheduleSwift.td @@ -1070,6 +1070,16 @@ let SchedModel = SwiftModel in { def : ReadAdvance; def : ReadAdvance; + // Overriden via InstRW for this processor. + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + def : WriteRes; + // Not specified. 
def : InstRW<[SwiftWriteP01OneCycle2x], (instregex "ABS")>; // Preload. diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 1979cbf50125..c4f23c66e4ea 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -85,9 +85,9 @@ namespace llvm { extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine X(getTheARMLETarget()); + RegisterTargetMachine A(getTheThumbLETarget()); RegisterTargetMachine Y(getTheARMBETarget()); - RegisterTargetMachine A(getTheThumbLETarget()); - RegisterTargetMachine B(getTheThumbBETarget()); + RegisterTargetMachine B(getTheThumbBETarget()); PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeGlobalISel(Registry); @@ -263,6 +263,11 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT, else this->Options.EABIVersion = EABI::EABI5; } + + initAsmInfo(); + if (!Subtarget.isThumb() && !Subtarget.hasARMOps()) + report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " + "support ARM mode execution!"); } ARMBaseTargetMachine::~ARMBaseTargetMachine() = default; @@ -355,22 +360,6 @@ TargetIRAnalysis ARMBaseTargetMachine::getTargetIRAnalysis() { }); } -void ARMTargetMachine::anchor() {} - -ARMTargetMachine::ARMTargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, CodeGenOpt::Level OL, - bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { - initAsmInfo(); - if (!Subtarget.hasARMOps()) - report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " - "support ARM mode execution!"); -} - -void ARMLETargetMachine::anchor() {} ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -378,9 +367,7 @@ ARMLETargetMachine::ARMLETargetMachine(const Target &T, const Triple &TT, Optional RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ARMBETargetMachine::anchor() {} + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, @@ -388,39 +375,7 @@ ARMBETargetMachine::ARMBETargetMachine(const Target &T, const Triple &TT, Optional RM, CodeModel::Model CM, CodeGenOpt::Level OL) - : ARMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} - -void ThumbTargetMachine::anchor() {} - -ThumbTargetMachine::ThumbTargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, - CodeGenOpt::Level OL, bool isLittle) - : ARMBaseTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, isLittle) { - initAsmInfo(); -} - -void ThumbLETargetMachine::anchor() {} - -ThumbLETargetMachine::ThumbLETargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true) {} - -void ThumbBETargetMachine::anchor() {} - -ThumbBETargetMachine::ThumbBETargetMachine(const Target &T, const Triple &TT, - StringRef CPU, StringRef FS, - const TargetOptions &Options, - Optional RM, - CodeModel::Model CM, - CodeGenOpt::Level OL) - : ThumbTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {} + : ARMBaseTargetMachine(T, TT, CPU, FS, Options, 
RM, CM, OL, false) {} namespace { diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index f0ca9427d9fb..e5eb27114c72 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -62,23 +62,9 @@ public: } }; -/// ARM target machine. +/// ARM/Thumb little endian target machine. /// -class ARMTargetMachine : public ARMBaseTargetMachine { - virtual void anchor(); - -public: - ARMTargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool isLittle); -}; - -/// ARM little endian target machine. -/// -class ARMLETargetMachine : public ARMTargetMachine { - void anchor() override; - +class ARMLETargetMachine : public ARMBaseTargetMachine { public: ARMLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -86,11 +72,9 @@ public: CodeGenOpt::Level OL); }; -/// ARM big endian target machine. +/// ARM/Thumb big endian target machine. /// -class ARMBETargetMachine : public ARMTargetMachine { - void anchor() override; - +class ARMBETargetMachine : public ARMBaseTargetMachine { public: ARMBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, @@ -98,44 +82,6 @@ public: CodeGenOpt::Level OL); }; -/// Thumb target machine. -/// Due to the way architectures are handled, this represents both -/// Thumb-1 and Thumb-2. -/// -class ThumbTargetMachine : public ARMBaseTargetMachine { - virtual void anchor(); - -public: - ThumbTargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional RM, CodeModel::Model CM, - CodeGenOpt::Level OL, bool isLittle); -}; - -/// Thumb little endian target machine. -/// -class ThumbLETargetMachine : public ThumbTargetMachine { - void anchor() override; - -public: - ThumbLETargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - -/// Thumb big endian target machine. 
-/// -class ThumbBETargetMachine : public ThumbTargetMachine { - void anchor() override; - -public: - ThumbBETargetMachine(const Target &T, const Triple &TT, StringRef CPU, - StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); -}; - } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMTARGETMACHINE_H diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 94f9e8dfebbf..edbf2b99126c 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -30,8 +30,8 @@ using namespace dwarf; void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { - const ARMTargetMachine &ARM_TM = static_cast<const ARMTargetMachine &>(TM); - bool isAAPCS_ABI = ARM_TM.TargetABI == ARMTargetMachine::ARMABI::ARM_ABI_AAPCS; + const ARMBaseTargetMachine &ARM_TM = static_cast<const ARMBaseTargetMachine &>(TM); + bool isAAPCS_ABI = ARM_TM.TargetABI == ARMBaseTargetMachine::ARMABI::ARM_ABI_AAPCS; genExecuteOnly = ARM_TM.getSubtargetImpl()->genExecuteOnly(); TargetLoweringObjectFileELF::Initialize(Ctx, TM); diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 1a17d4e33e4f..f917c35b9ceb 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -535,14 +535,14 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // Look for a temporary register to use. // First, compute the liveness information. - LivePhysRegs UsedRegs(STI.getRegisterInfo()); + const TargetRegisterInfo &TRI = *STI.getRegisterInfo(); + LivePhysRegs UsedRegs(TRI); UsedRegs.addLiveOuts(MBB); // The semantic of pristines changed recently and now, // the callee-saved registers that are touched in the function // are not part of the pristines set anymore. // Add those callee-saved now. - const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF); + const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF); for (unsigned i = 0; CSRegs[i]; ++i) UsedRegs.addReg(CSRegs[i]); @@ -561,12 +561,12 @@ bool Thumb1FrameLowering::emitPopSpecialFixUp(MachineBasicBlock &MBB, // And some temporary register, just in case. unsigned TemporaryReg = 0; BitVector PopFriendly = - TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::tGPRRegClassID)); + TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::tGPRRegClassID)); assert(PopFriendly.any() && "No allocatable pop-friendly register?!"); // Rebuild the GPRs from the high registers because they are removed // from the GPR reg class for thumb1. BitVector GPRsNoLRSP = - TRI->getAllocatableSet(MF, TRI->getRegClass(ARM::hGPRRegClassID)); + TRI.getAllocatableSet(MF, TRI.getRegClass(ARM::hGPRRegClassID)); GPRsNoLRSP |= PopFriendly; GPRsNoLRSP.reset(ARM::LR); GPRsNoLRSP.reset(ARM::SP); diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index 06ad2b3ffdf8..f10ca394f36c 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -902,7 +902,6 @@ let Defs = [SREG] in // CPI Rd, K // Compares a register with an 8 bit immediate.
- let Uses = [SREG] in def CPIRdK : FRdK<0b0011, (outs), (ins GPR8:$rd, imm_ldi8:$k), diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index 6897161c903c..cc7a7c3849bc 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -132,6 +132,10 @@ BPFTargetLowering::BPFTargetLowering(const TargetMachine &TM, MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 128; } +bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { + return false; +} + SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { case ISD::BR_CC: @@ -496,8 +500,11 @@ const char *BPFTargetLowering::getTargetNodeName(unsigned Opcode) const { SDValue BPFTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { + auto N = cast<GlobalAddressSDNode>(Op); + assert(N->getOffset() == 0 && "Invalid offset for global address"); + SDLoc DL(Op); - const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal(); + const GlobalValue *GV = N->getGlobal(); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i64); return DAG.getNode(BPFISD::Wrapper, DL, MVT::i64, GA); diff --git a/lib/Target/BPF/BPFISelLowering.h b/lib/Target/BPF/BPFISelLowering.h index 3d1726be286e..0b8a8ca20c3b 100644 --- a/lib/Target/BPF/BPFISelLowering.h +++ b/lib/Target/BPF/BPFISelLowering.h @@ -42,6 +42,10 @@ public: // This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + // This method decides whether folding a constant offset + // with the given GlobalAddress is legal. + bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override; + MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index a04aca4afa0f..25018b9ed510 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1657,7 +1657,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B, // defined. From the point of view of the liveness tracking, it is ok to // store it as a whole, but if we break it up we may end up storing a // register that is entirely undefined.
- LivePhysRegs LPR(&HRI); + LivePhysRegs LPR(HRI); LPR.addLiveIns(B); SmallVector,2> Clobbers; for (auto R = B.begin(); R != It; ++R) { diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 03794511414e..66e07c67958e 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -1254,7 +1254,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { const MachineOperand &Op1 = MI.getOperand(1); const MachineOperand &Op2 = MI.getOperand(2); const MachineOperand &Op3 = MI.getOperand(3); - LivePhysRegs LiveAtMI(&HRI); + LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); if (Op0.getReg() != Op2.getReg()) { @@ -1283,7 +1283,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MachineOperand &Op1 = MI.getOperand(1); MachineOperand &Op2 = MI.getOperand(2); MachineOperand &Op3 = MI.getOperand(3); - LivePhysRegs LiveAtMI(&HRI); + LivePhysRegs LiveAtMI(HRI); getLiveRegsAt(LiveAtMI, MI); bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg()); diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 0f99dfe342b8..93fb688fc1c0 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -412,6 +412,15 @@ def PS_vstorerwu_ai: STrivv_template, def PS_vstorerwu_ai_128B: STrivv_template, Requires<[HasV60T,UseHVXDbl]>; +let isPseudo = 1, isCodeGenOnly = 1, mayStore = 1, hasSideEffects = 0 in { + def PS_vstorerq_ai: Pseudo<(outs), + (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs:$Qt), "", []>, + Requires<[HasV60T,UseHVXSgl]>; + def PS_vstorerq_ai_128B: Pseudo<(outs), + (ins IntRegs:$Rs, s32_0Imm:$Off, VecPredRegs128B:$Qt), "", []>, + Requires<[HasV60T,UseHVXDbl]>; +} + // Vector load pseudos let Predicates = [HasV60T, UseHVX], isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in @@ -429,30 +438,16 @@ def PS_vloadrwu_ai: LDrivv_template, def PS_vloadrwu_ai_128B: LDrivv_template, Requires<[HasV60T,UseHVXDbl]>; -// Store vector predicate pseudo. 
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13, - isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { - def PS_vstorerq_ai : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXSgl]>; - - def PS_vstorerq_ai_128B : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VectorRegs:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXSgl]>; - - def PS_vloadrq_ai : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXDbl]>; - - def PS_vloadrq_ai_128B : STInst<(outs), - (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1), - ".error \"should not emit\" ", []>, - Requires<[HasV60T,UseHVXDbl]>; +let isPseudo = 1, isCodeGenOnly = 1, mayLoad = 1, hasSideEffects = 0 in { + def PS_vloadrq_ai: Pseudo<(outs VecPredRegs:$Qd), + (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>, + Requires<[HasV60T,UseHVXSgl]>; + def PS_vloadrq_ai_128B: Pseudo<(outs VecPredRegs128B:$Qd), + (ins IntRegs:$Rs, s32_0Imm:$Off), "", []>, + Requires<[HasV60T,UseHVXDbl]>; } + let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in class VSELInst : InstHexagon; diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp index 2a1bb63af789..1fc157900ed5 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp +++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp @@ -50,11 +50,6 @@ bool HexagonRegisterInfo::isEHReturnCalleeSaveReg(unsigned R) const { R == Hexagon::R3 || R == Hexagon::D0 || R == Hexagon::D1; } -bool HexagonRegisterInfo::isCalleeSaveReg(unsigned Reg) const { - return Hexagon::R16 <= Reg && Reg <= Hexagon::R27; -} - - const MCPhysReg * HexagonRegisterInfo::getCallerSavedRegs(const MachineFunction *MF, const TargetRegisterClass *RC) const { diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h index 8a3f175b8488..5f65fad2cc04 100644 --- a/lib/Target/Hexagon/HexagonRegisterInfo.h +++ b/lib/Target/Hexagon/HexagonRegisterInfo.h @@ -77,7 +77,6 @@ public: unsigned getFirstCallerSavedNonParamReg() const; bool isEHReturnCalleeSaveReg(unsigned Reg) const; - bool isCalleeSaveReg(unsigned Reg) const; }; } // end namespace llvm diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index c21b6e2515d3..cd474921d4bc 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -214,12 +214,12 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) { for (auto &MB : MF) { auto Begin = MB.begin(), End = MB.end(); while (Begin != End) { - // First the first non-boundary starting from the end of the last + // Find the first non-boundary starting from the end of the last // scheduling region. MachineBasicBlock::iterator RB = Begin; while (RB != End && HII->isSchedulingBoundary(*RB, &MB, MF)) ++RB; - // First the first boundary starting from the beginning of the new + // Find the first boundary starting from the beginning of the new // region. 
MachineBasicBlock::iterator RE = RB; while (RE != End && !HII->isSchedulingBoundary(*RE, &MB, MF)) diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index 8be2a898e380..34b966df7761 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -29,6 +29,7 @@ subdirectories = MSP430 NVPTX Mips + Nios2 PowerPC RISCV Sparc diff --git a/lib/Target/MSP430/MSP430.td b/lib/Target/MSP430/MSP430.td index dfea669f3ba1..203864dd4065 100644 --- a/lib/Target/MSP430/MSP430.td +++ b/lib/Target/MSP430/MSP430.td @@ -22,6 +22,18 @@ def FeatureX : SubtargetFeature<"ext", "ExtendedInsts", "true", "Enable MSP430-X extensions">; +def FeatureHWMult16 + : SubtargetFeature<"hwmult16", "HWMultMode", "HWMult16", + "Enable 16-bit hardware multiplier">; + +def FeatureHWMult32 + : SubtargetFeature<"hwmult32", "HWMultMode", "HWMult32", + "Enable 32-bit hardware multiplier">; + +def FeatureHWMultF5 + : SubtargetFeature<"hwmultf5", "HWMultMode", "HWMultF5", + "Enable F5 series hardware multiplier">; + //===----------------------------------------------------------------------===// // MSP430 supported processors. //===----------------------------------------------------------------------===// @@ -29,6 +41,8 @@ class Proc Features> : Processor; def : Proc<"generic", []>; +def : Proc<"msp430", []>; +def : Proc<"msp430x", [FeatureX]>; //===----------------------------------------------------------------------===// // Register File Description diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp index cd58eda5d924..0b02f79f472a 100644 --- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp +++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp @@ -403,12 +403,12 @@ void MSP430DAGToDAGISel::Select(SDNode *Node) { int FI = cast(Node)->getIndex(); SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i16); if (Node->hasOneUse()) { - CurDAG->SelectNodeTo(Node, MSP430::ADD16ri, MVT::i16, TFI, + CurDAG->SelectNodeTo(Node, MSP430::ADDframe, MVT::i16, TFI, CurDAG->getTargetConstant(0, dl, MVT::i16)); return; } ReplaceNode(Node, CurDAG->getMachineNode( - MSP430::ADD16ri, dl, MVT::i16, TFI, + MSP430::ADDframe, dl, MVT::i16, TFI, CurDAG->getTargetConstant(0, dl, MVT::i16))); return; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index cc6e64043f54..dae14fd301ee 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -38,27 +38,6 @@ using namespace llvm; #define DEBUG_TYPE "msp430-lower" -typedef enum { - NoHWMult, - HWMult16, - HWMult32, - HWMultF5 -} HWMultUseMode; - -static cl::opt -HWMultMode("mhwmult", cl::Hidden, - cl::desc("Hardware multiplier use mode"), - cl::init(NoHWMult), - cl::values( - clEnumValN(NoHWMult, "none", - "Do not use hardware multiplier"), - clEnumValN(HWMult16, "16bit", - "Use 16-bit hardware multiplier"), - clEnumValN(HWMult32, "32bit", - "Use 32-bit hardware multiplier"), - clEnumValN(HWMultF5, "f5series", - "Use F5 series hardware multiplier"))); - MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI) : TargetLowering(TM) { @@ -262,7 +241,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setCmpLibcallCC(LC.Op, LC.Cond); } - if (HWMultMode == HWMult16) { + if (STI.hasHWMult16()) { const struct { const RTLIB::Libcall Op; const char * const Name; @@ -277,7 +256,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); } 
- } else if (HWMultMode == HWMult32) { + } else if (STI.hasHWMult32()) { const struct { const RTLIB::Libcall Op; const char * const Name; @@ -292,7 +271,7 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, for (const auto &LC : LibraryCalls) { setLibcallName(LC.Op, LC.Name); } - } else if (HWMultMode == HWMultF5) { + } else if (STI.hasHWMultF5()) { const struct { const RTLIB::Libcall Op; const char * const Name; diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td index 1cd18611e52c..cec43040f60d 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.td +++ b/lib/Target/MSP430/MSP430InstrInfo.td @@ -122,6 +122,11 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2), [(MSP430callseq_end timm:$amt1, timm:$amt2)]>; } +let Defs = [SR], Uses = [SP] in { +def ADDframe : Pseudo<(outs GR16:$dst), (ins i16imm:$base, i16imm:$offset), + "# ADDframe PSEUDO", []>; +} + let usesCustomInserter = 1 in { let Uses = [SR] in { def Select8 : Pseudo<(outs GR8:$dst), (ins GR8:$src, GR8:$src2, i8imm:$cc), diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 9600bc28f100..7a3b7a8bd5ff 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -127,7 +127,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // Fold imm into offset Offset += MI.getOperand(FIOperandNum + 1).getImm(); - if (MI.getOpcode() == MSP430::ADD16ri) { + if (MI.getOpcode() == MSP430::ADDframe) { // This is actually "load effective address" of the stack slot // instruction. We have only two-address instructions, thus we need to // expand it into mov + add diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp index 6216348e4d71..776a9dcb11d4 100644 --- a/lib/Target/MSP430/MSP430Subtarget.cpp +++ b/lib/Target/MSP430/MSP430Subtarget.cpp @@ -19,6 +19,20 @@ using namespace llvm; #define DEBUG_TYPE "msp430-subtarget" +static cl::opt +HWMultModeOption("mhwmult", cl::Hidden, + cl::desc("Hardware multiplier use mode for MSP430"), + cl::init(MSP430Subtarget::NoHWMult), + cl::values( + clEnumValN(MSP430Subtarget::NoHWMult, "none", + "Do not use hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMult16, "16bit", + "Use 16-bit hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMult32, "32bit", + "Use 32-bit hardware multiplier"), + clEnumValN(MSP430Subtarget::HWMultF5, "f5series", + "Use F5 series hardware multiplier"))); + #define GET_SUBTARGETINFO_TARGET_DESC #define GET_SUBTARGETINFO_CTOR #include "MSP430GenSubtargetInfo.inc" @@ -27,7 +41,18 @@ void MSP430Subtarget::anchor() { } MSP430Subtarget & MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { - ParseSubtargetFeatures("generic", FS); + ExtendedInsts = false; + HWMultMode = NoHWMult; + + std::string CPUName = CPU; + if (CPUName.empty()) + CPUName = "msp430"; + + ParseSubtargetFeatures(CPUName, FS); + + if (HWMultModeOption != NoHWMult) + HWMultMode = HWMultModeOption; + return *this; } diff --git a/lib/Target/MSP430/MSP430Subtarget.h b/lib/Target/MSP430/MSP430Subtarget.h index 1a00d85e01cb..8828dfd65878 100644 --- a/lib/Target/MSP430/MSP430Subtarget.h +++ b/lib/Target/MSP430/MSP430Subtarget.h @@ -30,8 +30,15 @@ namespace llvm { class StringRef; class MSP430Subtarget : public MSP430GenSubtargetInfo { +public: + enum HWMultEnum { + NoHWMult, HWMult16, HWMult32, HWMultF5 + }; + +private: virtual void anchor(); bool ExtendedInsts; + HWMultEnum 
HWMultMode; MSP430FrameLowering FrameLowering; MSP430InstrInfo InstrInfo; MSP430TargetLowering TLInfo; @@ -50,6 +57,10 @@ public: /// subtarget options. Definition of function is auto generated by tblgen. void ParseSubtargetFeatures(StringRef CPU, StringRef FS); + bool hasHWMult16() const { return HWMultMode == HWMult16; } + bool hasHWMult32() const { return HWMultMode == HWMult32; } + bool hasHWMultF5() const { return HWMultMode == HWMultF5; } + const TargetFrameLowering *getFrameLowering() const override { return &FrameLowering; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 3641a70d61b5..8fe4e75f3e18 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -813,28 +813,28 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, !isShiftedMask(CN->getZExtValue(), SMPos1, SMSize1)) return SDValue(); - // The shift masks must have the same position and size. - if (SMPos0 != SMPos1 || SMSize0 != SMSize1) - return SDValue(); + // The shift masks must have the same position and size. + if (SMPos0 != SMPos1 || SMSize0 != SMSize1) + return SDValue(); - SDValue Shl = And1.getOperand(0); + SDValue Shl = And1.getOperand(0); - if (!(CN = dyn_cast(Shl.getOperand(1)))) - return SDValue(); + if (!(CN = dyn_cast(Shl.getOperand(1)))) + return SDValue(); - unsigned Shamt = CN->getZExtValue(); + unsigned Shamt = CN->getZExtValue(); - // Return if the shift amount and the first bit position of mask are not the - // same. - EVT ValTy = N->getValueType(0); - if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) - return SDValue(); + // Return if the shift amount and the first bit position of mask are not the + // same. + EVT ValTy = N->getValueType(0); + if ((Shamt != SMPos0) || (SMPos0 + SMSize0 > ValTy.getSizeInBits())) + return SDValue(); - SDLoc DL(N); - return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), - DAG.getConstant(SMPos0, DL, MVT::i32), - DAG.getConstant(SMSize0, DL, MVT::i32), - And0.getOperand(0)); + SDLoc DL(N); + return DAG.getNode(MipsISD::Ins, DL, ValTy, Shl.getOperand(0), + DAG.getConstant(SMPos0, DL, MVT::i32), + DAG.getConstant(SMSize0, DL, MVT::i32), + And0.getOperand(0)); } else { // Pattern match DINS. 
// $dst = or (and $src, mask0), mask1 diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 8f5ecadecdea..1f4e933db2a2 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -59,9 +59,8 @@ static cl::opt void MipsSubtarget::anchor() { } -MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, bool little, - const MipsTargetMachine &TM) +MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, + bool little, const MipsTargetMachine &TM) : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(MipsDefault), IsLittle(little), IsSoftFloat(false), IsSingleFloat(false), IsFPXX(false), NoABICalls(false), IsFP64bit(false), UseOddSPReg(true), @@ -77,8 +76,6 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, const std::string &CPU, FrameLowering(MipsFrameLowering::create(*this)), TLInfo(MipsTargetLowering::create(TM, *this)) { - PreviousInMips16Mode = InMips16Mode; - if (MipsArchVersion == MipsDefault) MipsArchVersion = Mips32; diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index cca2cb8a4660..b4d15ee361ff 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -119,9 +119,6 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // Mips16 hard float bool InMips16HardFloat; - // PreviousInMips16 -- the function we just processed was in Mips 16 Mode - bool PreviousInMips16Mode; - // InMicroMips -- can process MicroMips instructions bool InMicroMipsMode; @@ -178,8 +175,8 @@ public: /// This constructor initializes the data members to match that /// of the specified triple. - MipsSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, - bool little, const MipsTargetMachine &TM); + MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, bool little, + const MipsTargetMachine &TM); /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. diff --git a/lib/Target/Nios2/CMakeLists.txt b/lib/Target/Nios2/CMakeLists.txt new file mode 100644 index 000000000000..78db452094bd --- /dev/null +++ b/lib/Target/Nios2/CMakeLists.txt @@ -0,0 +1,18 @@ +set(LLVM_TARGET_DEFINITIONS Nios2.td) + +#Generate Nios2GenRegisterInfo.inc and Nios2GenInstrInfo.inc which included by +#your hand code C++ files. +#Nios2GenRegisterInfo.inc came from Nios2RegisterInfo.td, Nios2GenInstrInfo.inc +#came from Nios2InstrInfo.td. +tablegen(LLVM Nios2GenRegisterInfo.inc -gen-register-info) +tablegen(LLVM Nios2GenInstrInfo.inc -gen-instr-info) + +#Nios2CommonTableGen must be defined +add_public_tablegen_target(Nios2CommonTableGen) + +#Nios2CodeGen should match with LLVMBuild.txt Nios2CodeGen +add_llvm_target(Nios2CodeGen Nios2TargetMachine.cpp) + +#Should match with "subdirectories = MCTargetDesc TargetInfo" in LLVMBuild.txt +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/Nios2/LLVMBuild.txt b/lib/Target/Nios2/LLVMBuild.txt new file mode 100644 index 000000000000..b40a76379706 --- /dev/null +++ b/lib/Target/Nios2/LLVMBuild.txt @@ -0,0 +1,61 @@ +;===- ./lib/Target/Nios2/LLVMBuild.txt -------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +#Following comments extracted from http: // llvm.org/docs/LLVMBuild.html + +[common] +subdirectories = + MCTargetDesc + TargetInfo + +[component_0] +#TargetGroup components are an extension of LibraryGroups, specifically for +#defining LLVM targets(which are handled specially in a few places). +type = TargetGroup +#The name of the component should always be the name of the target.(should +#match "def Nios2 : Target" in Nios2.td) +name = Nios2 +#Nios2 component is located in directory Target / +parent = Target +#Whether this target defines an assembly parser, assembly printer, disassembler +#, and supports JIT compilation.They are optional. + +[component_1] +#component_1 is a Library type and name is Nios2CodeGen.After build it will +#in lib / libLLVMNios2CodeGen.a of your build command directory. +type = Library +name = Nios2CodeGen +#Nios2CodeGen component(Library) is located in directory Nios2 / +parent = Nios2 +#If given, a list of the names of Library or LibraryGroup components which +#must also be linked in whenever this library is used.That is, the link time +#dependencies for this component.When tools are built, the build system will +#include the transitive closure of all required_libraries for the components +#the tool needs. +required_libraries = CodeGen + Core + GlobalISel + MC + Nios2Desc + Nios2Info + Support + Target +#end of required_libraries + +#All LLVMBuild.txt in Target / Nios2 and subdirectory use 'add_to_library_groups +#= Nios2' +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt new file mode 100644 index 000000000000..21def509a232 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,2 @@ +#MCTargetDesc / CMakeLists.txt +add_llvm_library(LLVMNios2Desc Nios2MCTargetDesc.cpp) diff --git a/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 000000000000..4dc6995e7f5c --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,25 @@ +;===- ./lib/Target/Nios2/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Nios2Desc +parent = Nios2 +required_libraries = MC + Nios2Info + Support +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp new file mode 100644 index 000000000000..d913166399c6 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.cpp @@ -0,0 +1,25 @@ +//===-- Nios2MCTargetDesc.cpp - Nios2 Target Descriptions -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Nios2 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "Nios2MCTargetDesc.h" +#include "llvm/MC/MCInstrInfo.h" + +using namespace llvm; + +#define GET_INSTRINFO_MC_DESC +#include "Nios2GenInstrInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "Nios2GenRegisterInfo.inc" + +extern "C" void LLVMInitializeNios2TargetMC() {} diff --git a/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h new file mode 100644 index 000000000000..d426062db168 --- /dev/null +++ b/lib/Target/Nios2/MCTargetDesc/Nios2MCTargetDesc.h @@ -0,0 +1,34 @@ +//===-- Nios2MCTargetDesc.h - Nios2 Target Descriptions ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file provides Nios2 specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H +#define LLVM_LIB_TARGET_NIOS2_MCTARGETDESC_NIOS2MCTARGETDESC_H + +namespace llvm { +class Target; +class Triple; + +Target &getTheNios2Target(); + +} // namespace llvm + +// Defines symbolic names for Nios2 registers. This defines a mapping from +// register name to register number. +#define GET_REGINFO_ENUM +#include "Nios2GenRegisterInfo.inc" + +// Defines symbolic names for the Nios2 instructions. +#define GET_INSTRINFO_ENUM +#include "Nios2GenInstrInfo.inc" + +#endif diff --git a/lib/Target/Nios2/Nios2.h b/lib/Target/Nios2/Nios2.h new file mode 100644 index 000000000000..87202f48cfbe --- /dev/null +++ b/lib/Target/Nios2/Nios2.h @@ -0,0 +1,25 @@ +//===-- Nios2.h - Top-level interface for Nios2 representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in +// the LLVM Nios2 back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2_H +#define LLVM_LIB_TARGET_NIOS2_NIOS2_H + +#include "MCTargetDesc/Nios2MCTargetDesc.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class Nios2TargetMachine; +} // namespace llvm + +#endif diff --git a/lib/Target/Nios2/Nios2.td b/lib/Target/Nios2/Nios2.td new file mode 100644 index 000000000000..e8abba863370 --- /dev/null +++ b/lib/Target/Nios2/Nios2.td @@ -0,0 +1,29 @@ +//===-- Nios2.td - Describe the Nios2 Target Machine -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Target-independent interfaces +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +//===----------------------------------------------------------------------===// +// Target-dependent interfaces +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + +include "Nios2RegisterInfo.td" +include "Nios2InstrInfo.td" + +def Nios2InstrInfo : InstrInfo; + +def Nios2 : Target { let InstructionSet = Nios2InstrInfo; } diff --git a/lib/Target/Nios2/Nios2InstrFormats.td b/lib/Target/Nios2/Nios2InstrFormats.td new file mode 100644 index 000000000000..79868be48a48 --- /dev/null +++ b/lib/Target/Nios2/Nios2InstrFormats.td @@ -0,0 +1,117 @@ +//===-- Nios2InstrFormats.td - Nios2 Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe NIOS2 instructions format +// +// +//===----------------------------------------------------------------------===// + +// Format specifies the encoding used by the instruction. This is part of the +// ad-hoc solution used to emit machine instruction encodings by our machine +// code emitter. +class Format val> { + bits<3> Value = val; +} + +def Pseudo : Format<0>; +def FrmI : Format<1>; +def FrmR : Format<2>; +def FrmJ : Format<3>; +def FrmOther : Format<4>; // Instruction w/ a custom format + +// Generic Nios2 Format +class Nios2Inst pattern, Format f> + : Instruction { + field bits<32> Inst; + Format Form = f; + + let Namespace = "Nios2"; + + let Size = 4; + + bits<6> Opcode = 0; + + // Bottom 6 bits are the 'opcode' field + let Inst{5 - 0} = Opcode; + + let OutOperandList = outs; + let InOperandList = ins; + + let AsmString = asmstr; + let Pattern = pattern; + + // + // Attributes specific to Nios2 instructions: + // + bits<3> FormBits = Form.Value; + + // TSFlags layout should be kept in sync with Nios2InstrInfo.h. 
+ let TSFlags{2 - 0} = FormBits; + + let DecoderNamespace = "Nios2"; +} + +// Nios2 Instruction Format +class InstSE pattern, Format f> + : Nios2Inst { +} + +//===----------------------------------------------------------------------===// +// Format I instruction class in Nios2 : <|A|B|immediate|opcode|> +//===----------------------------------------------------------------------===// + +class FI op, dag outs, dag ins, string asmstr, list pattern> + : InstSE { + bits<5> rA; + bits<5> rB; + bits<16> imm; + + let Opcode = op; + + let Inst{31 - 27} = rA; + let Inst{26 - 22} = rB; + let Inst{21 - 6} = imm; +} + +//===----------------------------------------------------------------------===// +// Format R instruction : <|A|B|C|opx|imm|opcode|> +//===----------------------------------------------------------------------===// + +class FR opx, dag outs, dag ins, string asmstr, list pattern> + : InstSE { + bits<5> rA; + bits<5> rB; + bits<5> rC; + bits<5> imm = 0; + + // opcode is always 0x3a for R instr. + let Opcode = 0x3a; + + let Inst{31 - 27} = rA; + let Inst{26 - 22} = rB; + let Inst{21 - 17} = rC; + // opx stands for opcode extension + let Inst{16 - 11} = opx; + // optional 5-bit immediate value + let Inst{10 - 6} = imm; +} + +//===----------------------------------------------------------------------===// +// Format J instruction class in Nios2 : <|address|opcode|> +//===----------------------------------------------------------------------===// + +class FJ op, dag outs, dag ins, string asmstr, list pattern> + : InstSE { + bits<26> addr; + + let Opcode = op; + + let Inst{31 - 6} = addr; +} diff --git a/lib/Target/Nios2/Nios2InstrInfo.td b/lib/Target/Nios2/Nios2InstrInfo.td new file mode 100644 index 000000000000..5e4815ab3e16 --- /dev/null +++ b/lib/Target/Nios2/Nios2InstrInfo.td @@ -0,0 +1,50 @@ +//===- Nios2InstrInfo.td - Target Description for Nios2 ------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the Nios2 implementation of the TargetInstrInfo class. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Instruction format superclass +//===----------------------------------------------------------------------===// + +include "Nios2InstrFormats.td" + +//===----------------------------------------------------------------------===// +// Nios2 Operand, Complex Patterns and Transformations Definitions. +//===----------------------------------------------------------------------===// + +def simm16 : Operand { + let DecoderMethod= "DecodeSimm16"; +} + +// Node immediate fits as 16-bit sign extended on target immediate. +// e.g. addi, andi +def immSExt16 : PatLeaf<(imm), [{ return isInt<16>(N->getSExtValue()); }]>; + +//===----------------------------------------------------------------------===// +// Instructions specific format +//===----------------------------------------------------------------------===// + +// Arithmetic and logical instructions with 2 register operands. 
+class ArithLogicI op, string instr_asm, SDNode OpNode, + Operand Od, PatLeaf imm_type, RegisterClass RC> : + FI { + let isReMaterializable = 1; +} + +//===----------------------------------------------------------------------===// +// Nios2 R1 Instructions +//===----------------------------------------------------------------------===// + +/// Arithmetic Instructions (ALU Immediate) +def ADDi : ArithLogicI<0x04, "addi", add, simm16, immSExt16, CPURegs>; diff --git a/lib/Target/Nios2/Nios2RegisterInfo.td b/lib/Target/Nios2/Nios2RegisterInfo.td new file mode 100644 index 000000000000..1808815816f3 --- /dev/null +++ b/lib/Target/Nios2/Nios2RegisterInfo.td @@ -0,0 +1,60 @@ +//===-- Nios2RegisterInfo.td - Nios2 Register defs ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +// We have bank of 32 registers. +class Nios2Reg : Register { + field bits<5> Num; + let Namespace = "Nios2"; +} + +// Nios2 CPU Registers +class Nios2GPRReg num, string n> : Nios2Reg { + let Num = num; +} + +//===----------------------------------------------------------------------===// +// Registers +//===----------------------------------------------------------------------===// + +let Namespace = "Nios2" in { + // General Purpose Registers + def ZERO : Nios2GPRReg<0, "zero">, DwarfRegNum<[ 0 ]>; + def AT : Nios2GPRReg<1, "at">, DwarfRegNum<[ 1 ]>; + foreach RegNum = 2 - 23 in { + def R #RegNum : Nios2GPRReg, DwarfRegNum<[ RegNum ]>; + } + def ET : Nios2GPRReg<24, "et">, DwarfRegNum<[ 24 ]>; + def BT : Nios2GPRReg<25, "bt">, DwarfRegNum<[ 25 ]>; + def GP : Nios2GPRReg<26, "gp">, DwarfRegNum<[ 26 ]>; + def SP : Nios2GPRReg<27, "sp">, DwarfRegNum<[ 27 ]>; + def FP : Nios2GPRReg<28, "fp">, DwarfRegNum<[ 28 ]>; + def EA : Nios2GPRReg<29, "ea">, DwarfRegNum<[ 29 ]>; + def BA : Nios2GPRReg<30, "ba">, DwarfRegNum<[ 30 ]>; + def RA : Nios2GPRReg<31, "ra">, DwarfRegNum<[ 31 ]>; + def PC : Nios2Reg<"pc">, DwarfRegNum<[ 32 ]>; +} + +//===----------------------------------------------------------------------===// +// Register Classes +//===----------------------------------------------------------------------===// + +def CPURegs : RegisterClass<"Nios2", [ i32 ], 32, + (add + // Reserved + ZERO, + AT, + // Return Values and Arguments + (sequence "R%u", 2, 7), + // Not preserved across procedure calls + // Caller saved + (sequence "R%u", 8, 15), + // Callee saved + (sequence "R%u", 16, 23), + // Reserved + ET, BT, GP, SP, FP, EA, BA, RA, PC)>; diff --git a/lib/Target/Nios2/Nios2TargetMachine.cpp b/lib/Target/Nios2/Nios2TargetMachine.cpp new file mode 100644 index 000000000000..16d4eabcfaf7 --- /dev/null +++ b/lib/Target/Nios2/Nios2TargetMachine.cpp @@ -0,0 +1,46 @@ +//===-- Nios2TargetMachine.cpp - Define TargetMachine for Nios2 -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Implements the info about Nios2 target spec. +// +//===----------------------------------------------------------------------===// + +#include "Nios2TargetMachine.h" +#include "Nios2.h" + +using namespace llvm; + +#define DEBUG_TYPE "nios2" + +extern "C" void LLVMInitializeNios2Target() { + // Register the target. 
+} + +static std::string computeDataLayout(const Triple &TT, StringRef CPU, + const TargetOptions &Options) { + return "e-p:32:32:32-i8:8:32-i16:16:32-n32"; +} + +static Reloc::Model getEffectiveRelocModel(CodeModel::Model CM, + Optional RM) { + if (!RM.hasValue() || CM == CodeModel::JITDefault) + return Reloc::Static; + return *RM; +} + +Nios2TargetMachine::Nios2TargetMachine(const Target &T, const Triple &TT, + StringRef CPU, StringRef FS, + const TargetOptions &Options, + Optional RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options), TT, CPU, FS, + Options, getEffectiveRelocModel(CM, RM), CM, OL) {} + +Nios2TargetMachine::~Nios2TargetMachine() {} diff --git a/lib/Target/Nios2/Nios2TargetMachine.h b/lib/Target/Nios2/Nios2TargetMachine.h new file mode 100644 index 000000000000..7f145c82f32c --- /dev/null +++ b/lib/Target/Nios2/Nios2TargetMachine.h @@ -0,0 +1,30 @@ +//===-- Nios2TargetMachine.h - Define TargetMachine for Nios2 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file declares the Nios2 specific subclass of TargetMachine. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H +#define LLVM_LIB_TARGET_NIOS2_NIOS2TARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" + +namespace llvm { +class Nios2TargetMachine : public LLVMTargetMachine { +public: + Nios2TargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Optional RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~Nios2TargetMachine() override; +}; +} // namespace llvm + +#endif diff --git a/lib/Target/Nios2/TargetInfo/CMakeLists.txt b/lib/Target/Nios2/TargetInfo/CMakeLists.txt new file mode 100644 index 000000000000..394d2c2680b7 --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/CMakeLists.txt @@ -0,0 +1 @@ +add_llvm_library(LLVMNios2Info Nios2TargetInfo.cpp) diff --git a/lib/Target/Nios2/TargetInfo/LLVMBuild.txt b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt new file mode 100644 index 000000000000..558f7501ea6b --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/Nios2/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Nios2Info +parent = Nios2 +required_libraries = Support +add_to_library_groups = Nios2 diff --git a/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp new file mode 100644 index 000000000000..e317686140f7 --- /dev/null +++ b/lib/Target/Nios2/TargetInfo/Nios2TargetInfo.cpp @@ -0,0 +1,24 @@ +//===-- Nios2TargetInfo.cpp - Nios2 Target Implementation -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Nios2.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheNios2Target() { + static Target TheNios2Target; + return TheNios2Target; +} + +extern "C" void LLVMInitializeNios2TargetInfo() { + RegisterTarget + X(getTheNios2Target(), "nios2", "Nios2"); +} diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp index ebd414baf1d2..41e3190c3eec 100644 --- a/lib/Target/PowerPC/PPCExpandISEL.cpp +++ b/lib/Target/PowerPC/PPCExpandISEL.cpp @@ -339,7 +339,7 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL, // Note: Cannot use stepBackward instead since we are using the Reg // liveness state at the end of MBB (liveOut of MBB) as the liveIn for // NewSuccessor. Otherwise, will cause cyclic dependence. - LivePhysRegs LPR(MF->getSubtarget().getRegisterInfo()); + LivePhysRegs LPR(*MF->getSubtarget().getRegisterInfo()); SmallVector, 2> Clobbers; for (MachineInstr &MI : *MBB) LPR.stepForward(MI, Clobbers); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index e65b1f1aa0a5..b90a5ee28342 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1596,9 +1596,8 @@ bool PPC::isSplatShuffleMask(ShuffleVectorSDNode *N, unsigned EltSize) { return true; } -bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, - unsigned &InsertAtByte, bool &Swap, bool IsLE) { // Check that the mask is shuffling words +static bool isWordShuffleMask(ShuffleVectorSDNode *N) { for (unsigned i = 0; i < 4; ++i) { unsigned B0 = N->getMaskElt(i*4); unsigned B1 = N->getMaskElt(i*4+1); @@ -1610,6 +1609,14 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } + return true; +} + +bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + unsigned &InsertAtByte, bool &Swap, bool IsLE) { + if (!isWordShuffleMask(N)) + return false; + // Now we look at mask elements 0,4,8,12 unsigned M0 = N->getMaskElt(0) / 4; unsigned M1 = N->getMaskElt(4) / 4; @@ -1680,6 +1687,69 @@ bool PPC::isXXINSERTWMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, return false; } +bool PPC::isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE) { + assert(N->getValueType(0) == MVT::v16i8 && "Shuffle vector expects v16i8"); + // Ensure each byte index of the word is consecutive. + if (!isWordShuffleMask(N)) + return false; + + // Now we look at mask elements 0,4,8,12, which are the beginning of words. 
+ unsigned M0 = N->getMaskElt(0) / 4; + unsigned M1 = N->getMaskElt(4) / 4; + unsigned M2 = N->getMaskElt(8) / 4; + unsigned M3 = N->getMaskElt(12) / 4; + + // If both vector operands for the shuffle are the same vector, the mask will + // contain only elements from the first one and the second one will be undef. + if (N->getOperand(1).isUndef()) { + assert(M0 < 4 && "Indexing into an undef vector?"); + if (M1 != (M0 + 1) % 4 || M2 != (M1 + 1) % 4 || M3 != (M2 + 1) % 4) + return false; + + ShiftElts = IsLE ? (4 - M0) % 4 : M0; + Swap = false; + return true; + } + + // Ensure each word index of the ShuffleVector Mask is consecutive. + if (M1 != (M0 + 1) % 8 || M2 != (M1 + 1) % 8 || M3 != (M2 + 1) % 8) + return false; + + if (IsLE) { + if (M0 == 0 || M0 == 7 || M0 == 6 || M0 == 5) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 3 left elements of the second vector + // (or if there is no shift to be done at all). + Swap = false; + ShiftElts = (8 - M0) % 8; + } else if (M0 == 4 || M0 == 3 || M0 == 2 || M0 == 1) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 3 left elements of the first vector + // (or if we're shifting by 4 - thereby simply swapping the vectors). + Swap = true; + ShiftElts = (4 - M0) % 4; + } + + return true; + } else { // BE + if (M0 == 0 || M0 == 1 || M0 == 2 || M0 == 3) { + // Input vectors don't need to be swapped if the leading element + // of the result is one of the 4 elements of the first vector. + Swap = false; + ShiftElts = M0; + } else if (M0 == 4 || M0 == 5 || M0 == 6 || M0 == 7) { + // Input vectors need to be swapped if the leading element + // of the result is one of the 4 elements of the right vector. + Swap = true; + ShiftElts = M0 - 4; + } + + return true; + } +} + + /// getVSPLTImmediate - Return the appropriate VSPLT* immediate to splat the /// specified isSplatShuffleMask VECTOR_SHUFFLE mask. unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize, @@ -7679,6 +7749,20 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins); } + + if (Subtarget.hasVSX() && + PPC::isXXSLDWIShuffleMask(SVOp, ShiftElts, Swap, isLittleEndian)) { + if (Swap) + std::swap(V1, V2); + SDValue Conv1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V1); + SDValue Conv2 = + DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, V2.isUndef() ? 
V1 : V2); + + SDValue Shl = DAG.getNode(PPCISD::VECSHL, dl, MVT::v4i32, Conv1, Conv2, + DAG.getConstant(ShiftElts, dl, MVT::i32)); + return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Shl); + } + if (Subtarget.hasVSX()) { if (V2.isUndef() && PPC::isSplatShuffleMask(SVOp, 4)) { int SplatIdx = PPC::getVSPLTImmediate(SVOp, 4, DAG); @@ -8212,10 +8296,12 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op, SDLoc DL(Op); switch (cast(Op.getOperand(ArgStart))->getZExtValue()) { case Intrinsic::ppc_cfence: { + assert(ArgStart == 1 && "llvm.ppc.cfence must carry a chain argument."); assert(Subtarget.isPPC64() && "Only 64-bit is supported for now."); return SDValue(DAG.getMachineNode(PPC::CFENCE8, DL, MVT::Other, DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, - Op.getOperand(ArgStart + 1))), + Op.getOperand(ArgStart + 1)), + Op.getOperand(0)), 0); } default: diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index acb77943b118..2f9eb95f6de6 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -450,7 +450,11 @@ namespace llvm { /// a VMRGEW or VMRGOW instruction bool isVMRGEOShuffleMask(ShuffleVectorSDNode *N, bool CheckEven, unsigned ShuffleKind, SelectionDAG &DAG); - + /// isXXSLDWIShuffleMask - Return true if this is a shuffle mask suitable + /// for a XXSLDWI instruction. + bool isXXSLDWIShuffleMask(ShuffleVectorSDNode *N, unsigned &ShiftElts, + bool &Swap, bool IsLE); + /// isVSLDOIShuffleMask - If this is a vsldoi shuffle mask, return the /// shift amount, otherwise return -1. int isVSLDOIShuffleMask(SDNode *N, unsigned ShuffleKind, diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index a3f894c81a01..165970f9678c 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -1001,7 +1001,9 @@ def ADDItlsgdL : Pseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16imm64:$disp), isPPC64; // LR8 is a true define, while the rest of the Defs are clobbers. X3 is // explicitly defined when this op is created, so not mentioned here. -let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, +// This is lowered to BL8_NOP_TLS by the assembly printer, so the size must be +// correct because the branch select pass is relying on it. 
+let hasExtraSrcRegAllocReq = 1, hasExtraDefRegAllocReq = 1, Size = 8, Defs = [X0,X4,X5,X6,X7,X8,X9,X10,X11,X12,LR8,CTR8,CR0,CR1,CR5,CR6,CR7] in def GETtlsADDR : Pseudo<(outs g8rc:$rD), (ins g8rc:$reg, tlsgd:$sym), "#GETtlsADDR", diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 46f103141bc1..fd6785e963a6 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1931,6 +1931,8 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case PPC::DFSTOREf64: { assert(Subtarget.hasP9Vector() && "Invalid D-Form Pseudo-ops on non-P9 target."); + assert(MI.getOperand(2).isReg() && MI.getOperand(1).isImm() && + "D-form op must have register and immediate operands"); unsigned UpperOpcode, LowerOpcode; switch (MI.getOpcode()) { case PPC::DFLOADf32: diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 0766cfe4a987..26b99eced23c 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -46,7 +46,7 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>, ]>; def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>, - SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3> + SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3> ]>; def SDT_PPCVecInsert : SDTypeProfile<1, 3, [ SDTCisVec<0>, diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index b98140fedfc0..1589ab03e507 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1066,6 +1066,10 @@ def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>; def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>; +// PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and +// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2), the later one is more profitable. +def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>; + // Selects. 
def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)), (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>; @@ -2379,8 +2383,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Load Vector Indexed def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc, - [(set v2f64:$XT, (load xoaddr:$src))]>; - + [(set v2f64:$XT, (load xaddr:$src))]>; // Load Vector (Left-justified) with Length def LXVL : XX1Form<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB), "lxvl $XT, $src, $rB", IIC_LdStLoad, @@ -2430,7 +2433,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { // Store Vector Indexed def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc, - [(store v2f64:$XT, xoaddr:$dst)]>; + [(store v2f64:$XT, xaddr:$dst)]>; // Store Vector (Left-justified) with Length def STXVL : XX1Form<31, 397, (outs), (ins vsrc:$XT, memr:$dst, g8rc:$rB), @@ -2498,21 +2501,38 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>; } // IsLittleEndian, HasP9Vector - def : Pat<(v2f64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2i64 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4f32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (load xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; - def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4f32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(store v4i32:$rS, xoaddr:$dst), (STXVX $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), - (STXVX $rS, xoaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), - (STXVX $rS, xoaddr:$dst)>; + // D-Form Load/Store + def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + (STXV $rS, memrix16:$dst)>; + + + def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; + def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), + (STXVX $rS, xaddr:$dst)>; + def : 
Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), + (STXVX $rS, xaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2704,9 +2724,15 @@ def FltToUIntLoad { def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } +def FltToLongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); +} def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } +def FltToULongLoadP9 { + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); +} def FltToLong { dag A = (i64 (PPCmfvsr (PPCfctidz (fpextend f32:$A)))); } @@ -2728,9 +2754,15 @@ def DblToULong { def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } +def DblToIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); +} def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } +def DblToUIntLoadP9 { + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); +} def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); } @@ -2898,17 +2930,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)), (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; - def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)), + def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; - def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddr:$A), VSFRC)), 0))>; - def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)), + def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddr:$A), VSFRC)), 0))>; diff --git a/lib/Target/SystemZ/SystemZExpandPseudo.cpp b/lib/Target/SystemZ/SystemZExpandPseudo.cpp index 92ce8089c24f..d02db9a617a3 100644 --- a/lib/Target/SystemZ/SystemZExpandPseudo.cpp +++ b/lib/Target/SystemZ/SystemZExpandPseudo.cpp @@ -74,7 +74,7 @@ bool SystemZExpandPseudo::expandLOCRMux(MachineBasicBlock &MBB, unsigned CCValid = MI.getOperand(3).getImm(); unsigned CCMask = MI.getOperand(4).getImm(); - LivePhysRegs LiveRegs(&TII->getRegisterInfo()); + LivePhysRegs LiveRegs(TII->getRegisterInfo()); LiveRegs.addLiveOuts(MBB); for (auto I = std::prev(MBB.end()); I != MBBI; --I) LiveRegs.stepBackward(*I); diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index a30bf34857b5..b34c181124de 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -236,32 +236,30 @@ void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode, void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const { MachineBasicBlock *MBB = MI->getParent(); MachineFunction &MF = *MBB->getParent(); - const unsigned Reg = MI->getOperand(0).getReg(); + const unsigned Reg64 = MI->getOperand(0).getReg(); + const unsigned Reg32 = RI.getSubReg(Reg64, SystemZ::subreg_l32); - // Conveniently, all 4 instructions are cloned from LOAD_STACK_GUARD, - // so they already have operand 0 set to 
reg. + // EAR can only load the low subregister, so use a shift for %a0 to produce + // the GR containing %a0 and %a1. // ear , %a0 - MachineInstr *Ear1MI = MF.CloneMachineInstr(MI); - MBB->insert(MI, Ear1MI); - Ear1MI->setDesc(get(SystemZ::EAR)); - MachineInstrBuilder(MF, Ear1MI).addReg(SystemZ::A0); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32) + .addReg(SystemZ::A0) + .addReg(Reg64, RegState::ImplicitDefine); // sllg , , 32 - MachineInstr *SllgMI = MF.CloneMachineInstr(MI); - MBB->insert(MI, SllgMI); - SllgMI->setDesc(get(SystemZ::SLLG)); - MachineInstrBuilder(MF, SllgMI).addReg(Reg).addReg(0).addImm(32); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::SLLG), Reg64) + .addReg(Reg64) + .addReg(0) + .addImm(32); // ear , %a1 - MachineInstr *Ear2MI = MF.CloneMachineInstr(MI); - MBB->insert(MI, Ear2MI); - Ear2MI->setDesc(get(SystemZ::EAR)); - MachineInstrBuilder(MF, Ear2MI).addReg(SystemZ::A1); + BuildMI(*MBB, MI, MI->getDebugLoc(), get(SystemZ::EAR), Reg32) + .addReg(SystemZ::A1); // lg , 40() MI->setDesc(get(SystemZ::LG)); - MachineInstrBuilder(MF, MI).addReg(Reg).addImm(40).addReg(0); + MachineInstrBuilder(MF, MI).addReg(Reg64).addImm(40).addReg(0); } // Emit a zero-extending move from 32-bit GPR SrcReg to 32-bit GPR diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 3766ed45b8c4..ad597f5c65f0 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -55,6 +55,7 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector); + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 32ab475f1186..e5d3209ec6a9 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1316,16 +1316,17 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { while (!Done) { bool UpdateLocLex = true; + AsmToken::TokenKind TK = getLexer().getKind(); // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an // identifier. Don't try an parse it as a register. - if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") + if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") && + TK != AsmToken::Identifier) break; // If we're parsing an immediate expression, we don't expect a '['.
if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) break; - AsmToken::TokenKind TK = getLexer().getKind(); switch (TK) { default: { if (SM.isValidEndState()) { diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149b..7471373334f6 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -11,6 +11,7 @@ tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) +tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables) if(LLVM_BUILD_GLOBAL_ISEL) tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 3a421fe77392..fe105298f5c1 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -127,6 +127,9 @@ def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", def FeatureCDI : SubtargetFeature<"avx512cd", "HasCDI", "true", "Enable AVX-512 Conflict Detection Instructions", [FeatureAVX512]>; +def FeatureVPOPCNTDQ : SubtargetFeature<"avx512vpopcntdq", "HasVPOPCNTDQ", + "true", "Enable AVX-512 Population Count Instructions", + [FeatureAVX512]>; def FeaturePFI : SubtargetFeature<"avx512pf", "HasPFI", "true", "Enable AVX-512 PreFetch Instructions", [FeatureAVX512]>; diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index a5489b9aa8b7..313920e02c3e 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -1655,8 +1655,8 @@ void FPS::handleSpecialFP(MachineBasicBlock::iterator &Inst) { } void FPS::setKillFlags(MachineBasicBlock &MBB) const { - const TargetRegisterInfo *TRI = - MBB.getParent()->getSubtarget().getRegisterInfo(); + const TargetRegisterInfo &TRI = + *MBB.getParent()->getSubtarget().getRegisterInfo(); LivePhysRegs LPR(TRI); LPR.addLiveOuts(MBB); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 37b248416e4a..86744b064132 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1364,6 +1364,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v8i64, Legal); } + if (Subtarget.hasVPOPCNTDQ()) { + // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512 + // version of popcntd/q. + for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, + MVT::v4i32, MVT::v2i64}) + setOperationAction(ISD::CTPOP, VT, Legal); + } + // Custom lower several nodes. 
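The intent of the VPOPCNTDQ hook above (together with the avx512_unary_lowering patterns added to X86InstrAVX512.td further down) is that 128-bit and 256-bit vector popcounts are carried out by the 512-bit VPOPCNTD/VPOPCNTQ instruction on a widened register. A rough user-level sketch with compiler intrinsics, assuming a translation unit built with -mavx512f -mavx512vpopcntdq (illustrative only; the cast intrinsics leave the upper lanes unspecified, which is fine because they are discarded):

#include <immintrin.h>

// Popcount each 32-bit lane of a 256-bit vector by round-tripping through the
// 512-bit vpopcntd, mirroring the INSERT_SUBREG / VPOPCNTDZrr / EXTRACT_SUBREG
// sequence the Z256_Alt pattern produces.
__m256i popcnt_epi32_256(__m256i V) {
  __m512i Wide = _mm512_castsi256_si512(V);  // widen; upper 256 bits are don't-care
  __m512i Cnt  = _mm512_popcnt_epi32(Wide);  // AVX512VPOPCNTDQ vpopcntd on a zmm
  return _mm512_castsi512_si256(Cnt);        // keep only the low 256 bits
}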
for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index f9344413bbcf..d8702693884d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -2693,22 +2693,22 @@ multiclass avx512_load_vl opc, string OpcodeStr, } multiclass avx512_store opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag st_frag, PatFrag mstore> { + PatFrag st_frag, PatFrag mstore, string Name> { let hasSideEffects = 0 in { def rr_REV : AVX512PI, EVEX; + [], _.ExeDomain>, EVEX, FoldGenData; def rrk_REV : AVX512PI, EVEX, EVEX_K; + [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData; def rrkz_REV : AVX512PI, EVEX, EVEX_KZ; + [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData; } def mr : AVX512PI opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + string Name> { let Predicates = [prd] in defm Z : avx512_store, EVEX_V512; + masked_store_unaligned, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store, EVEX_V256; + masked_store_unaligned, Name#Z256>, EVEX_V256; defm Z128 : avx512_store, EVEX_V128; + masked_store_unaligned, Name#Z128>, EVEX_V128; } } multiclass avx512_alignedstore_vl opc, string OpcodeStr, - AVX512VLVectorVTInfo _, Predicate prd> { + AVX512VLVectorVTInfo _, Predicate prd, + string Name> { let Predicates = [prd] in defm Z : avx512_store, EVEX_V512; + masked_store_aligned512, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_store, EVEX_V256; + masked_store_aligned256, Name#Z256>, EVEX_V256; defm Z128 : avx512_store, EVEX_V128; + masked_store_aligned128, Name#Z128>, EVEX_V128; } } defm VMOVAPS : avx512_alignedload_vl<0x28, "vmovaps", avx512vl_f32_info, HasAVX512>, avx512_alignedstore_vl<0x29, "vmovaps", avx512vl_f32_info, - HasAVX512>, PS, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVAPS">, + PS, EVEX_CD8<32, CD8VF>; defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, HasAVX512>, avx512_alignedstore_vl<0x29, "vmovapd", avx512vl_f64_info, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVAPD">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, null_frag>, - avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512>, + avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, + "VMOVUPS">, PS, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, null_frag>, - avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512>, + avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, + "VMOVUPD">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512>, PD, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVDQA32">, + PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512>, PD, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVDQA64">, + PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, - avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI>, XD, EVEX_CD8<8, CD8VF>; + avx512_store_vl<0x7F, "vmovdqu8", 
avx512vl_i8_info, + HasBWI, "VMOVDQU8">, + XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI>, XD, VEX_W, EVEX_CD8<16, CD8VF>; + HasBWI, "VMOVDQU16">, + XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512>, XS, EVEX_CD8<32, CD8VF>; + HasAVX512, "VMOVDQU32">, + XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512>, XS, VEX_W, EVEX_CD8<64, CD8VF>; + HasAVX512, "VMOVDQU64">, + XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need // to load or store from a ZMM register instead. These are converted in @@ -3354,17 +3366,52 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; -let hasSideEffects = 0 in -defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info, - (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), - "vmovss.s", "$src2, $src1", "$src1, $src2", []>, - XS, EVEX_4V, VEX_LIG; +let hasSideEffects = 0 in { + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], NoItinerary>, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrr">; -let hasSideEffects = 0 in -defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info, - (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), - "vmovsd.s", "$src2, $src1", "$src1, $src2", []>, - XD, EVEX_4V, VEX_LIG, VEX_W; +let Constraints = "$src0 = $dst" in + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrk">; + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), + "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrkz">; + + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins VR128X:$src1, FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, + FoldGenData<"VMOVSDZrr">; + +let Constraints = "$src0 = $dst" in + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + VR128X:$src1, FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# + "$dst {${mask}}, $src1, $src2}", + [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrk">; + + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, + FR64X:$src2), + "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# + "$dst {${mask}} {z}, $src1, $src2}", + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrkz">; +} let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { @@ -8648,6 +8695,41 @@ let 
Predicates = [HasCDI, NoVLX] in { sub_xmm)>; } +//===---------------------------------------------------------------------===// +// Counts number of ones - VPOPCNTD and VPOPCNTQ +//===---------------------------------------------------------------------===// + +multiclass avx512_unary_rmb_popcnt opc, string OpcodeStr, X86VectorVTInfo VTInfo> { + let Predicates = [HasVPOPCNTDQ] in + defm Z : avx512_unary_rmb, EVEX_V512; +} + +// Use 512bit version to implement 128/256 bit. +multiclass avx512_unary_lowering { + let Predicates = [prd] in { + def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + (EXTRACT_SUBREG + (!cast(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info256.RC:$src1, + _.info256.SubRegIdx)), + _.info256.SubRegIdx)>; + + def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + (EXTRACT_SUBREG + (!cast(NAME # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info128.RC:$src1, + _.info128.SubRegIdx)), + _.info128.SubRegIdx)>; + } +} + +defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>, + avx512_unary_lowering; +defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>, + avx512_unary_lowering, VEX_W; + //===---------------------------------------------------------------------===// // Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// @@ -8795,7 +8877,7 @@ multiclass avx512_extract_elt_w { def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX, TAPD; + EVEX, TAPD, FoldGenData; defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; } diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index 66382014f6e8..e38bbc9b3d36 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -964,10 +964,10 @@ multiclass ArithBinOp_RF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_Rev; - def NAME#16rr_REV : BinOpRR_Rev; - def NAME#32rr_REV : BinOpRR_Rev; - def NAME#64rr_REV : BinOpRR_Rev; + def NAME#8rr_REV : BinOpRR_Rev, FoldGenData; + def NAME#16rr_REV : BinOpRR_Rev, FoldGenData; + def NAME#32rr_REV : BinOpRR_Rev, FoldGenData; + def NAME#64rr_REV : BinOpRR_Rev, FoldGenData; def NAME#8rm : BinOpRM_RF; def NAME#16rm : BinOpRM_RF; @@ -1049,10 +1049,10 @@ multiclass ArithBinOp_RFF BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } // isConvertibleToThreeAddress } // isCommutable - def NAME#8rr_REV : BinOpRR_RFF_Rev; - def NAME#16rr_REV : BinOpRR_RFF_Rev; - def NAME#32rr_REV : BinOpRR_RFF_Rev; - def NAME#64rr_REV : BinOpRR_RFF_Rev; + def NAME#8rr_REV : BinOpRR_RFF_Rev, FoldGenData; + def NAME#16rr_REV : BinOpRR_RFF_Rev, FoldGenData; + def NAME#32rr_REV : BinOpRR_RFF_Rev, FoldGenData; + def NAME#64rr_REV : BinOpRR_RFF_Rev, FoldGenData; def NAME#8rm : BinOpRM_RFF; def NAME#16rm : BinOpRM_RFF; @@ -1129,10 +1129,10 @@ multiclass ArithBinOp_F BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, } } // isCommutable - def NAME#8rr_REV : BinOpRR_F_Rev; - def NAME#16rr_REV : BinOpRR_F_Rev; - def NAME#32rr_REV : BinOpRR_F_Rev; - def NAME#64rr_REV : BinOpRR_F_Rev; + def NAME#8rr_REV : BinOpRR_F_Rev, FoldGenData; + def NAME#16rr_REV : BinOpRR_F_Rev, FoldGenData; + def NAME#32rr_REV : BinOpRR_F_Rev, FoldGenData; + def NAME#64rr_REV : BinOpRR_F_Rev, FoldGenData; def NAME#8rm : BinOpRM_F; def NAME#16rm : 
BinOpRM_F; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 1941ae57f0f1..3a3cdc9fa574 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -297,7 +297,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_LIG; + VEX_LIG, FoldGenData; } multiclass fma4s_int opc, string OpcodeStr, Operand memop, @@ -321,6 +321,12 @@ let isCodeGenOnly = 1 in { "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; +let hasSideEffects = 0 in + def rr_Int_REV : FMA4, VEX_LIG, FoldGenData; } // isCodeGenOnly = 1 } @@ -372,12 +378,13 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_REV : FMA4; + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, + FoldGenData; def Yrr_REV : FMA4, - VEX_L; + VEX_L, FoldGenData; } // isCodeGenOnly = 1 } diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index c2fe786732dc..bfcbf71d252f 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -225,6 +225,12 @@ class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } class XOP { Encoding OpEnc = EncXOP; } class XOP_4V : XOP { bit hasVEX_4V = 1; } +// Specify the alternative register form instruction to replace the current +// instruction in case it was picked during generation of memory folding tables +class FoldGenData { + string FoldGenRegForm = _RegisterForm; +} + class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, InstrItinClass itin, @@ -304,6 +310,10 @@ class X86Inst opcod, Format f, ImmType i, dag outs, dag ins, CD8_EltSize, !srl(VectSize, CD8_Form{1-0}))), 0); + // Used in the memory folding generation (TableGen backend) to point to an alternative + // instruction to replace the current one in case it got picked during generation. + string FoldGenRegForm = ?; + // TSFlags layout should be kept in sync with X86BaseInfo.h. let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index f7083a7448ce..33fbd41bb631 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -121,172 +121,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) (STI.is64Bit() ? 
X86::RETQ : X86::RETL)), Subtarget(STI), RI(STI.getTargetTriple()) { - static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { - { X86::ADC32ri, X86::ADC32mi, 0 }, - { X86::ADC32ri8, X86::ADC32mi8, 0 }, - { X86::ADC32rr, X86::ADC32mr, 0 }, - { X86::ADC64ri32, X86::ADC64mi32, 0 }, - { X86::ADC64ri8, X86::ADC64mi8, 0 }, - { X86::ADC64rr, X86::ADC64mr, 0 }, - { X86::ADD16ri, X86::ADD16mi, 0 }, - { X86::ADD16ri8, X86::ADD16mi8, 0 }, - { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, - { X86::ADD16ri8_DB, X86::ADD16mi8, TB_NO_REVERSE }, - { X86::ADD16rr, X86::ADD16mr, 0 }, - { X86::ADD16rr_DB, X86::ADD16mr, TB_NO_REVERSE }, - { X86::ADD32ri, X86::ADD32mi, 0 }, - { X86::ADD32ri8, X86::ADD32mi8, 0 }, - { X86::ADD32ri_DB, X86::ADD32mi, TB_NO_REVERSE }, - { X86::ADD32ri8_DB, X86::ADD32mi8, TB_NO_REVERSE }, - { X86::ADD32rr, X86::ADD32mr, 0 }, - { X86::ADD32rr_DB, X86::ADD32mr, TB_NO_REVERSE }, - { X86::ADD64ri32, X86::ADD64mi32, 0 }, - { X86::ADD64ri8, X86::ADD64mi8, 0 }, - { X86::ADD64ri32_DB,X86::ADD64mi32, TB_NO_REVERSE }, - { X86::ADD64ri8_DB, X86::ADD64mi8, TB_NO_REVERSE }, - { X86::ADD64rr, X86::ADD64mr, 0 }, - { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, - { X86::ADD8ri, X86::ADD8mi, 0 }, - { X86::ADD8rr, X86::ADD8mr, 0 }, - { X86::AND16ri, X86::AND16mi, 0 }, - { X86::AND16ri8, X86::AND16mi8, 0 }, - { X86::AND16rr, X86::AND16mr, 0 }, - { X86::AND32ri, X86::AND32mi, 0 }, - { X86::AND32ri8, X86::AND32mi8, 0 }, - { X86::AND32rr, X86::AND32mr, 0 }, - { X86::AND64ri32, X86::AND64mi32, 0 }, - { X86::AND64ri8, X86::AND64mi8, 0 }, - { X86::AND64rr, X86::AND64mr, 0 }, - { X86::AND8ri, X86::AND8mi, 0 }, - { X86::AND8rr, X86::AND8mr, 0 }, - { X86::DEC16r, X86::DEC16m, 0 }, - { X86::DEC32r, X86::DEC32m, 0 }, - { X86::DEC64r, X86::DEC64m, 0 }, - { X86::DEC8r, X86::DEC8m, 0 }, - { X86::INC16r, X86::INC16m, 0 }, - { X86::INC32r, X86::INC32m, 0 }, - { X86::INC64r, X86::INC64m, 0 }, - { X86::INC8r, X86::INC8m, 0 }, - { X86::NEG16r, X86::NEG16m, 0 }, - { X86::NEG32r, X86::NEG32m, 0 }, - { X86::NEG64r, X86::NEG64m, 0 }, - { X86::NEG8r, X86::NEG8m, 0 }, - { X86::NOT16r, X86::NOT16m, 0 }, - { X86::NOT32r, X86::NOT32m, 0 }, - { X86::NOT64r, X86::NOT64m, 0 }, - { X86::NOT8r, X86::NOT8m, 0 }, - { X86::OR16ri, X86::OR16mi, 0 }, - { X86::OR16ri8, X86::OR16mi8, 0 }, - { X86::OR16rr, X86::OR16mr, 0 }, - { X86::OR32ri, X86::OR32mi, 0 }, - { X86::OR32ri8, X86::OR32mi8, 0 }, - { X86::OR32rr, X86::OR32mr, 0 }, - { X86::OR64ri32, X86::OR64mi32, 0 }, - { X86::OR64ri8, X86::OR64mi8, 0 }, - { X86::OR64rr, X86::OR64mr, 0 }, - { X86::OR8ri, X86::OR8mi, 0 }, - { X86::OR8rr, X86::OR8mr, 0 }, - { X86::ROL16r1, X86::ROL16m1, 0 }, - { X86::ROL16rCL, X86::ROL16mCL, 0 }, - { X86::ROL16ri, X86::ROL16mi, 0 }, - { X86::ROL32r1, X86::ROL32m1, 0 }, - { X86::ROL32rCL, X86::ROL32mCL, 0 }, - { X86::ROL32ri, X86::ROL32mi, 0 }, - { X86::ROL64r1, X86::ROL64m1, 0 }, - { X86::ROL64rCL, X86::ROL64mCL, 0 }, - { X86::ROL64ri, X86::ROL64mi, 0 }, - { X86::ROL8r1, X86::ROL8m1, 0 }, - { X86::ROL8rCL, X86::ROL8mCL, 0 }, - { X86::ROL8ri, X86::ROL8mi, 0 }, - { X86::ROR16r1, X86::ROR16m1, 0 }, - { X86::ROR16rCL, X86::ROR16mCL, 0 }, - { X86::ROR16ri, X86::ROR16mi, 0 }, - { X86::ROR32r1, X86::ROR32m1, 0 }, - { X86::ROR32rCL, X86::ROR32mCL, 0 }, - { X86::ROR32ri, X86::ROR32mi, 0 }, - { X86::ROR64r1, X86::ROR64m1, 0 }, - { X86::ROR64rCL, X86::ROR64mCL, 0 }, - { X86::ROR64ri, X86::ROR64mi, 0 }, - { X86::ROR8r1, X86::ROR8m1, 0 }, - { X86::ROR8rCL, X86::ROR8mCL, 0 }, - { X86::ROR8ri, X86::ROR8mi, 0 }, - { X86::SAR16r1, X86::SAR16m1, 0 }, - { 
X86::SAR16rCL, X86::SAR16mCL, 0 }, - { X86::SAR16ri, X86::SAR16mi, 0 }, - { X86::SAR32r1, X86::SAR32m1, 0 }, - { X86::SAR32rCL, X86::SAR32mCL, 0 }, - { X86::SAR32ri, X86::SAR32mi, 0 }, - { X86::SAR64r1, X86::SAR64m1, 0 }, - { X86::SAR64rCL, X86::SAR64mCL, 0 }, - { X86::SAR64ri, X86::SAR64mi, 0 }, - { X86::SAR8r1, X86::SAR8m1, 0 }, - { X86::SAR8rCL, X86::SAR8mCL, 0 }, - { X86::SAR8ri, X86::SAR8mi, 0 }, - { X86::SBB32ri, X86::SBB32mi, 0 }, - { X86::SBB32ri8, X86::SBB32mi8, 0 }, - { X86::SBB32rr, X86::SBB32mr, 0 }, - { X86::SBB64ri32, X86::SBB64mi32, 0 }, - { X86::SBB64ri8, X86::SBB64mi8, 0 }, - { X86::SBB64rr, X86::SBB64mr, 0 }, - { X86::SHL16r1, X86::SHL16m1, 0 }, - { X86::SHL16rCL, X86::SHL16mCL, 0 }, - { X86::SHL16ri, X86::SHL16mi, 0 }, - { X86::SHL32r1, X86::SHL32m1, 0 }, - { X86::SHL32rCL, X86::SHL32mCL, 0 }, - { X86::SHL32ri, X86::SHL32mi, 0 }, - { X86::SHL64r1, X86::SHL64m1, 0 }, - { X86::SHL64rCL, X86::SHL64mCL, 0 }, - { X86::SHL64ri, X86::SHL64mi, 0 }, - { X86::SHL8r1, X86::SHL8m1, 0 }, - { X86::SHL8rCL, X86::SHL8mCL, 0 }, - { X86::SHL8ri, X86::SHL8mi, 0 }, - { X86::SHLD16rrCL, X86::SHLD16mrCL, 0 }, - { X86::SHLD16rri8, X86::SHLD16mri8, 0 }, - { X86::SHLD32rrCL, X86::SHLD32mrCL, 0 }, - { X86::SHLD32rri8, X86::SHLD32mri8, 0 }, - { X86::SHLD64rrCL, X86::SHLD64mrCL, 0 }, - { X86::SHLD64rri8, X86::SHLD64mri8, 0 }, - { X86::SHR16r1, X86::SHR16m1, 0 }, - { X86::SHR16rCL, X86::SHR16mCL, 0 }, - { X86::SHR16ri, X86::SHR16mi, 0 }, - { X86::SHR32r1, X86::SHR32m1, 0 }, - { X86::SHR32rCL, X86::SHR32mCL, 0 }, - { X86::SHR32ri, X86::SHR32mi, 0 }, - { X86::SHR64r1, X86::SHR64m1, 0 }, - { X86::SHR64rCL, X86::SHR64mCL, 0 }, - { X86::SHR64ri, X86::SHR64mi, 0 }, - { X86::SHR8r1, X86::SHR8m1, 0 }, - { X86::SHR8rCL, X86::SHR8mCL, 0 }, - { X86::SHR8ri, X86::SHR8mi, 0 }, - { X86::SHRD16rrCL, X86::SHRD16mrCL, 0 }, - { X86::SHRD16rri8, X86::SHRD16mri8, 0 }, - { X86::SHRD32rrCL, X86::SHRD32mrCL, 0 }, - { X86::SHRD32rri8, X86::SHRD32mri8, 0 }, - { X86::SHRD64rrCL, X86::SHRD64mrCL, 0 }, - { X86::SHRD64rri8, X86::SHRD64mri8, 0 }, - { X86::SUB16ri, X86::SUB16mi, 0 }, - { X86::SUB16ri8, X86::SUB16mi8, 0 }, - { X86::SUB16rr, X86::SUB16mr, 0 }, - { X86::SUB32ri, X86::SUB32mi, 0 }, - { X86::SUB32ri8, X86::SUB32mi8, 0 }, - { X86::SUB32rr, X86::SUB32mr, 0 }, - { X86::SUB64ri32, X86::SUB64mi32, 0 }, - { X86::SUB64ri8, X86::SUB64mi8, 0 }, - { X86::SUB64rr, X86::SUB64mr, 0 }, - { X86::SUB8ri, X86::SUB8mi, 0 }, - { X86::SUB8rr, X86::SUB8mr, 0 }, - { X86::XOR16ri, X86::XOR16mi, 0 }, - { X86::XOR16ri8, X86::XOR16mi8, 0 }, - { X86::XOR16rr, X86::XOR16mr, 0 }, - { X86::XOR32ri, X86::XOR32mi, 0 }, - { X86::XOR32ri8, X86::XOR32mi8, 0 }, - { X86::XOR32rr, X86::XOR32mr, 0 }, - { X86::XOR64ri32, X86::XOR64mi32, 0 }, - { X86::XOR64ri8, X86::XOR64mi8, 0 }, - { X86::XOR64rr, X86::XOR64mr, 0 }, - { X86::XOR8ri, X86::XOR8mi, 0 }, - { X86::XOR8rr, X86::XOR8mr, 0 } - }; +// Generated memory folding tables. 
+#include "X86GenFoldTables.inc" for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2Addr) { AddTableEntry(RegOp2MemOpTable2Addr, MemOp2RegOpTable, @@ -295,744 +131,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } - static const X86MemoryFoldTableEntry MemoryFoldTable0[] = { - { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, - { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, - { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, - { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, - { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, - { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, - { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, - { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, - { X86::CMP32ri, X86::CMP32mi, TB_FOLDED_LOAD }, - { X86::CMP32ri8, X86::CMP32mi8, TB_FOLDED_LOAD }, - { X86::CMP32rr, X86::CMP32mr, TB_FOLDED_LOAD }, - { X86::CMP64ri32, X86::CMP64mi32, TB_FOLDED_LOAD }, - { X86::CMP64ri8, X86::CMP64mi8, TB_FOLDED_LOAD }, - { X86::CMP64rr, X86::CMP64mr, TB_FOLDED_LOAD }, - { X86::CMP8ri, X86::CMP8mi, TB_FOLDED_LOAD }, - { X86::CMP8rr, X86::CMP8mr, TB_FOLDED_LOAD }, - { X86::DIV16r, X86::DIV16m, TB_FOLDED_LOAD }, - { X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD }, - { X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD }, - { X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD }, - { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE }, - { X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD }, - { X86::IDIV32r, X86::IDIV32m, TB_FOLDED_LOAD }, - { X86::IDIV64r, X86::IDIV64m, TB_FOLDED_LOAD }, - { X86::IDIV8r, X86::IDIV8m, TB_FOLDED_LOAD }, - { X86::IMUL16r, X86::IMUL16m, TB_FOLDED_LOAD }, - { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, - { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, - { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, - { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, - { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, - { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, - { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, - { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, - { X86::MOV32rr, X86::MOV32mr, TB_FOLDED_STORE }, - { X86::MOV64ri32, X86::MOV64mi32, TB_FOLDED_STORE }, - { X86::MOV64rr, X86::MOV64mr, TB_FOLDED_STORE }, - { X86::MOV8ri, X86::MOV8mi, TB_FOLDED_STORE }, - { X86::MOV8rr, X86::MOV8mr, TB_FOLDED_STORE }, - { X86::MOV8rr_NOREX, X86::MOV8mr_NOREX, TB_FOLDED_STORE }, - { X86::MOVAPDrr, X86::MOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVAPSrr, X86::MOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDQArr, X86::MOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::MOVDQUrr, X86::MOVDQUmr, TB_FOLDED_STORE }, - { X86::MOVPDI2DIrr, X86::MOVPDI2DImr, TB_FOLDED_STORE }, - { X86::MOVPQIto64rr,X86::MOVPQI2QImr, TB_FOLDED_STORE }, - { X86::MOVSDto64rr, X86::MOVSDto64mr, TB_FOLDED_STORE }, - { X86::MOVSS2DIrr, X86::MOVSS2DImr, TB_FOLDED_STORE }, - { X86::MOVUPDrr, X86::MOVUPDmr, TB_FOLDED_STORE }, - { X86::MOVUPSrr, X86::MOVUPSmr, TB_FOLDED_STORE }, - { X86::MUL16r, X86::MUL16m, TB_FOLDED_LOAD }, - { X86::MUL32r, X86::MUL32m, TB_FOLDED_LOAD }, - { X86::MUL64r, X86::MUL64m, TB_FOLDED_LOAD }, - { X86::MUL8r, X86::MUL8m, TB_FOLDED_LOAD }, - { X86::PEXTRDrr, X86::PEXTRDmr, TB_FOLDED_STORE }, - { X86::PEXTRQrr, X86::PEXTRQmr, TB_FOLDED_STORE }, - { X86::PUSH16r, X86::PUSH16rmm, TB_FOLDED_LOAD }, - { X86::PUSH32r, X86::PUSH32rmm, TB_FOLDED_LOAD }, - { X86::PUSH64r, X86::PUSH64rmm, TB_FOLDED_LOAD }, - { X86::SETAEr, X86::SETAEm, TB_FOLDED_STORE }, - { X86::SETAr, X86::SETAm, TB_FOLDED_STORE }, - { X86::SETBEr, X86::SETBEm, TB_FOLDED_STORE }, - { 
X86::SETBr, X86::SETBm, TB_FOLDED_STORE }, - { X86::SETEr, X86::SETEm, TB_FOLDED_STORE }, - { X86::SETGEr, X86::SETGEm, TB_FOLDED_STORE }, - { X86::SETGr, X86::SETGm, TB_FOLDED_STORE }, - { X86::SETLEr, X86::SETLEm, TB_FOLDED_STORE }, - { X86::SETLr, X86::SETLm, TB_FOLDED_STORE }, - { X86::SETNEr, X86::SETNEm, TB_FOLDED_STORE }, - { X86::SETNOr, X86::SETNOm, TB_FOLDED_STORE }, - { X86::SETNPr, X86::SETNPm, TB_FOLDED_STORE }, - { X86::SETNSr, X86::SETNSm, TB_FOLDED_STORE }, - { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, - { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, - { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, - { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, - { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, - { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, - { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, - { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, - { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, - { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, - - // AVX 128-bit versions of foldable instructions - { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPDrr, X86::VMOVAPDmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPSrr, X86::VMOVAPSmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQArr, X86::VMOVDQAmr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQUrr, X86::VMOVDQUmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIrr,X86::VMOVPDI2DImr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64rr, X86::VMOVPQI2QImr,TB_FOLDED_STORE }, - { X86::VMOVSDto64rr,X86::VMOVSDto64mr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIrr, X86::VMOVSS2DImr, TB_FOLDED_STORE }, - { X86::VMOVUPDrr, X86::VMOVUPDmr, TB_FOLDED_STORE }, - { X86::VMOVUPSrr, X86::VMOVUPSmr, TB_FOLDED_STORE }, - { X86::VPEXTRDrr, X86::VPEXTRDmr, TB_FOLDED_STORE }, - { X86::VPEXTRQrr, X86::VPEXTRQmr, TB_FOLDED_STORE }, - - // AVX 256-bit foldable instructions - { X86::VEXTRACTI128rr, X86::VEXTRACTI128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPDYrr, X86::VMOVAPDYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVAPSYrr, X86::VMOVAPSYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYmr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQUYrr, X86::VMOVDQUYmr, TB_FOLDED_STORE }, - { X86::VMOVUPDYrr, X86::VMOVUPDYmr, TB_FOLDED_STORE }, - { X86::VMOVUPSYrr, X86::VMOVUPSYmr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions - { X86::VEXTRACTF32x4Zrr,X86::VEXTRACTF32x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF32x8Zrr,X86::VEXTRACTF32x8Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x2Zrr,X86::VEXTRACTF64x2Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x4Zrr,X86::VEXTRACTF64x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x4Zrr,X86::VEXTRACTI32x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x8Zrr,X86::VEXTRACTI32x8Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE }, - { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZmr, TB_FOLDED_STORE }, - { X86::VMOVAPDZrr, X86::VMOVAPDZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zmr, TB_FOLDED_STORE | TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zmr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zmr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zmr, TB_FOLDED_STORE }, - { 
X86::VMOVDQU64Zrr, X86::VMOVDQU64Zmr, TB_FOLDED_STORE }, - { X86::VMOVPDI2DIZrr, X86::VMOVPDI2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVPQIto64Zrr, X86::VMOVPQI2QIZmr, TB_FOLDED_STORE }, - { X86::VMOVSDto64Zrr, X86::VMOVSDto64Zmr, TB_FOLDED_STORE }, - { X86::VMOVSS2DIZrr, X86::VMOVSS2DIZmr, TB_FOLDED_STORE }, - { X86::VMOVUPDZrr, X86::VMOVUPDZmr, TB_FOLDED_STORE }, - { X86::VMOVUPSZrr, X86::VMOVUPSZmr, TB_FOLDED_STORE }, - { X86::VPEXTRDZrr, X86::VPEXTRDZmr, TB_FOLDED_STORE }, - { X86::VPEXTRQZrr, X86::VPEXTRQZmr, TB_FOLDED_STORE }, - { X86::VPMOVDBZrr, X86::VPMOVDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVDWZrr, X86::VPMOVDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVQDZrr, X86::VPMOVQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVQWZrr, X86::VPMOVQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVWBZrr, X86::VPMOVWBZmr, TB_FOLDED_STORE }, - { X86::VPMOVSDBZrr, X86::VPMOVSDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVSDWZrr, X86::VPMOVSDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVSQDZrr, X86::VPMOVSQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVSQWZrr, X86::VPMOVSQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVSWBZrr, X86::VPMOVSWBZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSDBZrr, X86::VPMOVUSDBZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSDWZrr, X86::VPMOVUSDWZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSQDZrr, X86::VPMOVUSQDZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSQWZrr, X86::VPMOVUSQWZmr, TB_FOLDED_STORE }, - { X86::VPMOVUSWBZrr, X86::VPMOVUSWBZmr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTI32x4Z256rr,X86::VEXTRACTI32x4Z256mr, TB_FOLDED_STORE }, - { X86::VEXTRACTI64x2Z256rr,X86::VEXTRACTI64x2Z256mr, TB_FOLDED_STORE }, - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256mr, TB_FOLDED_STORE | TB_ALIGN_32 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256mr, TB_FOLDED_STORE }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256mr, TB_FOLDED_STORE }, - { X86::VPMOVDWZ256rr, X86::VPMOVDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVQDZ256rr, X86::VPMOVQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVWBZ256rr, X86::VPMOVWBZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSDWZ256rr, X86::VPMOVSDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSQDZ256rr, X86::VPMOVSQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVSWBZ256rr, X86::VPMOVSWBZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSDWZ256rr, X86::VPMOVUSDWZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSQDZ256rr, X86::VPMOVUSQDZ256mr, TB_FOLDED_STORE }, - { X86::VPMOVUSWBZ256rr, X86::VPMOVUSWBZ256mr, TB_FOLDED_STORE }, - - // AVX-512 foldable instructions (128-bit versions) - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128mr, TB_FOLDED_STORE | TB_ALIGN_16 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128mr, TB_FOLDED_STORE }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128mr, 
TB_FOLDED_STORE }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128mr, TB_FOLDED_STORE }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128mr, TB_FOLDED_STORE }, - - // F16C foldable instructions - { X86::VCVTPS2PHrr, X86::VCVTPS2PHmr, TB_FOLDED_STORE }, - { X86::VCVTPS2PHYrr, X86::VCVTPS2PHYmr, TB_FOLDED_STORE } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable0) { AddTableEntry(RegOp2MemOpTable0, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, TB_INDEX_0 | Entry.Flags); } - static const X86MemoryFoldTableEntry MemoryFoldTable1[] = { - { X86::BSF16rr, X86::BSF16rm, 0 }, - { X86::BSF32rr, X86::BSF32rm, 0 }, - { X86::BSF64rr, X86::BSF64rm, 0 }, - { X86::BSR16rr, X86::BSR16rm, 0 }, - { X86::BSR32rr, X86::BSR32rm, 0 }, - { X86::BSR64rr, X86::BSR64rm, 0 }, - { X86::CMP16rr, X86::CMP16rm, 0 }, - { X86::CMP32rr, X86::CMP32rm, 0 }, - { X86::CMP64rr, X86::CMP64rm, 0 }, - { X86::CMP8rr, X86::CMP8rm, 0 }, - { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, - { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, - { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, - { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, - { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, - { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, - { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, - { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, - { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, - { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, - { X86::IMUL16rri, X86::IMUL16rmi, 0 }, - { X86::IMUL16rri8, X86::IMUL16rmi8, 0 }, - { X86::IMUL32rri, X86::IMUL32rmi, 0 }, - { X86::IMUL32rri8, X86::IMUL32rmi8, 0 }, - { X86::IMUL64rri32, X86::IMUL64rmi32, 0 }, - { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, - { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE }, - { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE }, - { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE }, - { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE }, - { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE }, - { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE }, - { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, - { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, - { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, - { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, - { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, - { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE }, - { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE }, - { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE }, - { X86::MOV16rr, X86::MOV16rm, 0 }, - { X86::MOV32rr, X86::MOV32rm, 0 }, - { X86::MOV64rr, X86::MOV64rm, 0 }, - { X86::MOV64toPQIrr, X86::MOVQI2PQIrm, 0 }, - { X86::MOV64toSDrr, X86::MOV64toSDrm, 0 }, - { X86::MOV8rr, X86::MOV8rm, 0 }, - { X86::MOVAPDrr, X86::MOVAPDrm, TB_ALIGN_16 }, - { X86::MOVAPSrr, X86::MOVAPSrm, TB_ALIGN_16 }, - { X86::MOVDDUPrr, X86::MOVDDUPrm, TB_NO_REVERSE }, - { X86::MOVDI2PDIrr, X86::MOVDI2PDIrm, 0 }, - { X86::MOVDI2SSrr, X86::MOVDI2SSrm, 0 }, - { X86::MOVDQArr, X86::MOVDQArm, TB_ALIGN_16 }, - { X86::MOVDQUrr, X86::MOVDQUrm, 0 }, - { X86::MOVSHDUPrr, X86::MOVSHDUPrm, TB_ALIGN_16 }, - { X86::MOVSLDUPrr, 
X86::MOVSLDUPrm, TB_ALIGN_16 }, - { X86::MOVSX16rr8, X86::MOVSX16rm8, 0 }, - { X86::MOVSX32rr16, X86::MOVSX32rm16, 0 }, - { X86::MOVSX32rr8, X86::MOVSX32rm8, 0 }, - { X86::MOVSX64rr16, X86::MOVSX64rm16, 0 }, - { X86::MOVSX64rr32, X86::MOVSX64rm32, 0 }, - { X86::MOVSX64rr8, X86::MOVSX64rm8, 0 }, - { X86::MOVUPDrr, X86::MOVUPDrm, 0 }, - { X86::MOVUPSrr, X86::MOVUPSrm, 0 }, - { X86::MOVZPQILo2PQIrr, X86::MOVQI2PQIrm, TB_NO_REVERSE }, - { X86::MOVZX16rr8, X86::MOVZX16rm8, 0 }, - { X86::MOVZX32rr16, X86::MOVZX32rm16, 0 }, - { X86::MOVZX32_NOREXrr8, X86::MOVZX32_NOREXrm8, 0 }, - { X86::MOVZX32rr8, X86::MOVZX32rm8, 0 }, - { X86::PABSBrr, X86::PABSBrm, TB_ALIGN_16 }, - { X86::PABSDrr, X86::PABSDrm, TB_ALIGN_16 }, - { X86::PABSWrr, X86::PABSWrm, TB_ALIGN_16 }, - { X86::PCMPESTRIrr, X86::PCMPESTRIrm, TB_ALIGN_16 }, - { X86::PCMPESTRM128rr, X86::PCMPESTRM128rm, TB_ALIGN_16 }, - { X86::PCMPISTRIrr, X86::PCMPISTRIrm, TB_ALIGN_16 }, - { X86::PCMPISTRM128rr, X86::PCMPISTRM128rm, TB_ALIGN_16 }, - { X86::PHMINPOSUWrr128, X86::PHMINPOSUWrm128, TB_ALIGN_16 }, - { X86::PMOVSXBDrr, X86::PMOVSXBDrm, TB_NO_REVERSE }, - { X86::PMOVSXBQrr, X86::PMOVSXBQrm, TB_NO_REVERSE }, - { X86::PMOVSXBWrr, X86::PMOVSXBWrm, TB_NO_REVERSE }, - { X86::PMOVSXDQrr, X86::PMOVSXDQrm, TB_NO_REVERSE }, - { X86::PMOVSXWDrr, X86::PMOVSXWDrm, TB_NO_REVERSE }, - { X86::PMOVSXWQrr, X86::PMOVSXWQrm, TB_NO_REVERSE }, - { X86::PMOVZXBDrr, X86::PMOVZXBDrm, TB_NO_REVERSE }, - { X86::PMOVZXBQrr, X86::PMOVZXBQrm, TB_NO_REVERSE }, - { X86::PMOVZXBWrr, X86::PMOVZXBWrm, TB_NO_REVERSE }, - { X86::PMOVZXDQrr, X86::PMOVZXDQrm, TB_NO_REVERSE }, - { X86::PMOVZXWDrr, X86::PMOVZXWDrm, TB_NO_REVERSE }, - { X86::PMOVZXWQrr, X86::PMOVZXWQrm, TB_NO_REVERSE }, - { X86::PSHUFDri, X86::PSHUFDmi, TB_ALIGN_16 }, - { X86::PSHUFHWri, X86::PSHUFHWmi, TB_ALIGN_16 }, - { X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16 }, - { X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16 }, - { X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16 }, - { X86::RCPSSr, X86::RCPSSm, 0 }, - { X86::RCPSSr_Int, X86::RCPSSm_Int, TB_NO_REVERSE }, - { X86::ROUNDPDr, X86::ROUNDPDm, TB_ALIGN_16 }, - { X86::ROUNDPSr, X86::ROUNDPSm, TB_ALIGN_16 }, - { X86::ROUNDSDr, X86::ROUNDSDm, 0 }, - { X86::ROUNDSSr, X86::ROUNDSSm, 0 }, - { X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16 }, - { X86::RSQRTSSr, X86::RSQRTSSm, 0 }, - { X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, TB_NO_REVERSE }, - { X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 }, - { X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 }, - { X86::SQRTSDr, X86::SQRTSDm, 0 }, - { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE }, - { X86::SQRTSSr, X86::SQRTSSm, 0 }, - { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE }, - { X86::TEST16rr, X86::TEST16rm, 0 }, - { X86::TEST32rr, X86::TEST32rm, 0 }, - { X86::TEST64rr, X86::TEST64rm, 0 }, - { X86::TEST8rr, X86::TEST8rm, 0 }, - // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 - { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, - { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, - - // MMX version of foldable instructions - { X86::MMX_CVTPD2PIirr, X86::MMX_CVTPD2PIirm, 0 }, - { X86::MMX_CVTPI2PDirr, X86::MMX_CVTPI2PDirm, 0 }, - { X86::MMX_CVTPS2PIirr, X86::MMX_CVTPS2PIirm, 0 }, - { X86::MMX_CVTTPD2PIirr, X86::MMX_CVTTPD2PIirm, 0 }, - { X86::MMX_CVTTPS2PIirr, X86::MMX_CVTTPS2PIirm, 0 }, - { X86::MMX_MOVD64to64rr, X86::MMX_MOVQ64rm, 0 }, - { X86::MMX_PABSBrr64, X86::MMX_PABSBrm64, 0 }, - { X86::MMX_PABSDrr64, X86::MMX_PABSDrm64, 0 }, - { X86::MMX_PABSWrr64, X86::MMX_PABSWrm64, 0 }, - { X86::MMX_PSHUFWri, X86::MMX_PSHUFWmi, 0 }, - - // 3DNow! 
version of foldable instructions - { X86::PF2IDrr, X86::PF2IDrm, 0 }, - { X86::PF2IWrr, X86::PF2IWrm, 0 }, - { X86::PFRCPrr, X86::PFRCPrm, 0 }, - { X86::PFRSQRTrr, X86::PFRSQRTrm, 0 }, - { X86::PI2FDrr, X86::PI2FDrm, 0 }, - { X86::PI2FWrr, X86::PI2FWrm, 0 }, - { X86::PSWAPDrr, X86::PSWAPDrm, 0 }, - - // AVX 128-bit versions of foldable instructions - { X86::Int_VCOMISDrr, X86::Int_VCOMISDrm, TB_NO_REVERSE }, - { X86::Int_VCOMISSrr, X86::Int_VCOMISSrm, TB_NO_REVERSE }, - { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE }, - { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE }, - { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE }, - { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE }, - { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, - { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 }, - { X86::VCVTPD2PSrr, X86::VCVTPD2PSrm, 0 }, - { X86::VCVTPS2DQrr, X86::VCVTPS2DQrm, 0 }, - { X86::VCVTPS2PDrr, X86::VCVTPS2PDrm, TB_NO_REVERSE }, - { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQrm, 0 }, - { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 }, - { X86::VMOV64toPQIrr, X86::VMOVQI2PQIrm, 0 }, - { X86::VMOV64toSDrr, X86::VMOV64toSDrm, 0 }, - { X86::VMOVAPDrr, X86::VMOVAPDrm, TB_ALIGN_16 }, - { X86::VMOVAPSrr, X86::VMOVAPSrm, TB_ALIGN_16 }, - { X86::VMOVDDUPrr, X86::VMOVDDUPrm, TB_NO_REVERSE }, - { X86::VMOVDI2PDIrr, X86::VMOVDI2PDIrm, 0 }, - { X86::VMOVDI2SSrr, X86::VMOVDI2SSrm, 0 }, - { X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 }, - { X86::VMOVDQUrr, X86::VMOVDQUrm, 0 }, - { X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, 0 }, - { X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, 0 }, - { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 }, - { X86::VMOVUPSrr, X86::VMOVUPSrm, 0 }, - { X86::VMOVZPQILo2PQIrr,X86::VMOVQI2PQIrm, TB_NO_REVERSE }, - { X86::VPABSBrr, X86::VPABSBrm, 0 }, - { X86::VPABSDrr, X86::VPABSDrm, 0 }, - { X86::VPABSWrr, X86::VPABSWrm, 0 }, - { X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 }, - { X86::VPCMPESTRM128rr, X86::VPCMPESTRM128rm, 0 }, - { X86::VPCMPISTRIrr, X86::VPCMPISTRIrm, 0 }, - { X86::VPCMPISTRM128rr, X86::VPCMPISTRM128rm, 0 }, - { X86::VPHMINPOSUWrr128, X86::VPHMINPOSUWrm128, 0 }, - { X86::VPERMILPDri, X86::VPERMILPDmi, 0 }, - { X86::VPERMILPSri, X86::VPERMILPSmi, 0 }, - { X86::VPMOVSXBDrr, X86::VPMOVSXBDrm, TB_NO_REVERSE }, - { X86::VPMOVSXBQrr, X86::VPMOVSXBQrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWrr, X86::VPMOVSXBWrm, TB_NO_REVERSE }, - { X86::VPMOVSXDQrr, X86::VPMOVSXDQrm, TB_NO_REVERSE }, - { X86::VPMOVSXWDrr, X86::VPMOVSXWDrm, TB_NO_REVERSE }, - { X86::VPMOVSXWQrr, X86::VPMOVSXWQrm, TB_NO_REVERSE }, - { X86::VPMOVZXBDrr, X86::VPMOVZXBDrm, TB_NO_REVERSE }, - { X86::VPMOVZXBQrr, X86::VPMOVZXBQrm, TB_NO_REVERSE }, - { X86::VPMOVZXBWrr, X86::VPMOVZXBWrm, TB_NO_REVERSE }, - { X86::VPMOVZXDQrr, X86::VPMOVZXDQrm, TB_NO_REVERSE }, - { X86::VPMOVZXWDrr, X86::VPMOVZXWDrm, TB_NO_REVERSE }, - { X86::VPMOVZXWQrr, X86::VPMOVZXWQrm, TB_NO_REVERSE }, - { X86::VPSHUFDri, X86::VPSHUFDmi, 0 }, - { 
X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 }, - { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 }, - { X86::VPTESTrr, X86::VPTESTrm, 0 }, - { X86::VRCPPSr, X86::VRCPPSm, 0 }, - { X86::VROUNDPDr, X86::VROUNDPDm, 0 }, - { X86::VROUNDPSr, X86::VROUNDPSm, 0 }, - { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 }, - { X86::VSQRTPDr, X86::VSQRTPDm, 0 }, - { X86::VSQRTPSr, X86::VSQRTPSm, 0 }, - { X86::VTESTPDrr, X86::VTESTPDrm, 0 }, - { X86::VTESTPSrr, X86::VTESTPSrm, 0 }, - { X86::VUCOMISDrr, X86::VUCOMISDrm, 0 }, - { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, - - // AVX 256-bit foldable instructions - { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE }, - { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, - { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, - { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, - { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, - { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE }, - { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, - { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, - { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, - { X86::VMOVAPSYrr, X86::VMOVAPSYrm, TB_ALIGN_32 }, - { X86::VMOVDDUPYrr, X86::VMOVDDUPYrm, 0 }, - { X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 }, - { X86::VMOVDQUYrr, X86::VMOVDQUYrm, 0 }, - { X86::VMOVSLDUPYrr, X86::VMOVSLDUPYrm, 0 }, - { X86::VMOVSHDUPYrr, X86::VMOVSHDUPYrm, 0 }, - { X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 }, - { X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 }, - { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 }, - { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 }, - { X86::VPTESTYrr, X86::VPTESTYrm, 0 }, - { X86::VRCPPSYr, X86::VRCPPSYm, 0 }, - { X86::VROUNDYPDr, X86::VROUNDYPDm, 0 }, - { X86::VROUNDYPSr, X86::VROUNDYPSm, 0 }, - { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 }, - { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 }, - { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 }, - { X86::VTESTPDYrr, X86::VTESTPDYrm, 0 }, - { X86::VTESTPSYrr, X86::VTESTPSYrm, 0 }, - - // AVX2 foldable instructions - - // VBROADCASTS{SD}rr register instructions were an AVX2 addition while the - // VBROADCASTS{SD}rm memory instructions were available from AVX1. - // TB_NO_REVERSE prevents unfolding from introducing an illegal instruction - // on AVX1 targets. The VPBROADCAST instructions are all AVX2 instructions - // so they don't need an equivalent limitation. 
- { X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE }, - { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, - { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - { X86::VPABSBYrr, X86::VPABSBYrm, 0 }, - { X86::VPABSDYrr, X86::VPABSDYrm, 0 }, - { X86::VPABSWYrr, X86::VPABSWYrm, 0 }, - { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE }, - { X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE }, - { X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE }, - { X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE }, - { X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE }, - { X86::VPERMPDYri, X86::VPERMPDYmi, 0 }, - { X86::VPERMQYri, X86::VPERMQYmi, 0 }, - { X86::VPMOVSXBDYrr, X86::VPMOVSXBDYrm, TB_NO_REVERSE }, - { X86::VPMOVSXBQYrr, X86::VPMOVSXBQYrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWYrr, X86::VPMOVSXBWYrm, 0 }, - { X86::VPMOVSXDQYrr, X86::VPMOVSXDQYrm, 0 }, - { X86::VPMOVSXWDYrr, X86::VPMOVSXWDYrm, 0 }, - { X86::VPMOVSXWQYrr, X86::VPMOVSXWQYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBDYrr, X86::VPMOVZXBDYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBQYrr, X86::VPMOVZXBQYrm, TB_NO_REVERSE }, - { X86::VPMOVZXBWYrr, X86::VPMOVZXBWYrm, 0 }, - { X86::VPMOVZXDQYrr, X86::VPMOVZXDQYrm, 0 }, - { X86::VPMOVZXWDYrr, X86::VPMOVZXWDYrm, 0 }, - { X86::VPMOVZXWQYrr, X86::VPMOVZXWQYrm, TB_NO_REVERSE }, - { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 }, - { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 }, - { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 }, - - // XOP foldable instructions - { X86::VFRCZPDrr, X86::VFRCZPDrm, 0 }, - { X86::VFRCZPDrrY, X86::VFRCZPDrmY, 0 }, - { X86::VFRCZPSrr, X86::VFRCZPSrm, 0 }, - { X86::VFRCZPSrrY, X86::VFRCZPSrmY, 0 }, - { X86::VFRCZSDrr, X86::VFRCZSDrm, 0 }, - { X86::VFRCZSSrr, X86::VFRCZSSrm, 0 }, - { X86::VPHADDBDrr, X86::VPHADDBDrm, 0 }, - { X86::VPHADDBQrr, X86::VPHADDBQrm, 0 }, - { X86::VPHADDBWrr, X86::VPHADDBWrm, 0 }, - { X86::VPHADDDQrr, X86::VPHADDDQrm, 0 }, - { X86::VPHADDWDrr, X86::VPHADDWDrm, 0 }, - { X86::VPHADDWQrr, X86::VPHADDWQrm, 0 }, - { X86::VPHADDUBDrr, X86::VPHADDUBDrm, 0 }, - { X86::VPHADDUBQrr, X86::VPHADDUBQrm, 0 }, - { X86::VPHADDUBWrr, X86::VPHADDUBWrm, 0 }, - { X86::VPHADDUDQrr, X86::VPHADDUDQrm, 0 }, - { X86::VPHADDUWDrr, X86::VPHADDUWDrm, 0 }, - { X86::VPHADDUWQrr, X86::VPHADDUWQrm, 0 }, - { X86::VPHSUBBWrr, X86::VPHSUBBWrm, 0 }, - { X86::VPHSUBDQrr, X86::VPHSUBDQrm, 0 }, - { X86::VPHSUBWDrr, X86::VPHSUBWDrm, 0 }, - { X86::VPROTBri, X86::VPROTBmi, 0 }, - { X86::VPROTBrr, X86::VPROTBmr, 0 }, - { X86::VPROTDri, X86::VPROTDmi, 0 }, - { X86::VPROTDrr, X86::VPROTDmr, 0 }, - { X86::VPROTQri, X86::VPROTQmi, 0 }, - { X86::VPROTQrr, X86::VPROTQmr, 0 }, - { X86::VPROTWri, X86::VPROTWmi, 0 }, - { X86::VPROTWrr, X86::VPROTWmr, 0 }, - { X86::VPSHABrr, X86::VPSHABmr, 0 }, - { X86::VPSHADrr, X86::VPSHADmr, 0 }, - { X86::VPSHAQrr, X86::VPSHAQmr, 0 }, - { X86::VPSHAWrr, X86::VPSHAWmr, 0 }, - { X86::VPSHLBrr, X86::VPSHLBmr, 0 }, - { X86::VPSHLDrr, X86::VPSHLDmr, 0 }, - { X86::VPSHLQrr, X86::VPSHLQmr, 0 }, - { X86::VPSHLWrr, X86::VPSHLWmr, 0 }, - - // LWP foldable instructions - { X86::LWPINS32rri, X86::LWPINS32rmi, 0 }, - { X86::LWPINS64rri, X86::LWPINS64rmi, 0 }, - { X86::LWPVAL32rri, X86::LWPVAL32rmi, 0 }, - { X86::LWPVAL64rri, X86::LWPVAL64rmi, 0 }, - - // BMI/BMI2/LZCNT/POPCNT/TBM foldable instructions - { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, 
- { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, - { X86::BEXTRI32ri, X86::BEXTRI32mi, 0 }, - { X86::BEXTRI64ri, X86::BEXTRI64mi, 0 }, - { X86::BLCFILL32rr, X86::BLCFILL32rm, 0 }, - { X86::BLCFILL64rr, X86::BLCFILL64rm, 0 }, - { X86::BLCI32rr, X86::BLCI32rm, 0 }, - { X86::BLCI64rr, X86::BLCI64rm, 0 }, - { X86::BLCIC32rr, X86::BLCIC32rm, 0 }, - { X86::BLCIC64rr, X86::BLCIC64rm, 0 }, - { X86::BLCMSK32rr, X86::BLCMSK32rm, 0 }, - { X86::BLCMSK64rr, X86::BLCMSK64rm, 0 }, - { X86::BLCS32rr, X86::BLCS32rm, 0 }, - { X86::BLCS64rr, X86::BLCS64rm, 0 }, - { X86::BLSFILL32rr, X86::BLSFILL32rm, 0 }, - { X86::BLSFILL64rr, X86::BLSFILL64rm, 0 }, - { X86::BLSI32rr, X86::BLSI32rm, 0 }, - { X86::BLSI64rr, X86::BLSI64rm, 0 }, - { X86::BLSIC32rr, X86::BLSIC32rm, 0 }, - { X86::BLSIC64rr, X86::BLSIC64rm, 0 }, - { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, - { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, - { X86::BLSR32rr, X86::BLSR32rm, 0 }, - { X86::BLSR64rr, X86::BLSR64rm, 0 }, - { X86::BZHI32rr, X86::BZHI32rm, 0 }, - { X86::BZHI64rr, X86::BZHI64rm, 0 }, - { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, - { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, - { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, - { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, - { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, - { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, - { X86::RORX32ri, X86::RORX32mi, 0 }, - { X86::RORX64ri, X86::RORX64mi, 0 }, - { X86::SARX32rr, X86::SARX32rm, 0 }, - { X86::SARX64rr, X86::SARX64rm, 0 }, - { X86::SHRX32rr, X86::SHRX32rm, 0 }, - { X86::SHRX64rr, X86::SHRX64rm, 0 }, - { X86::SHLX32rr, X86::SHLX32rm, 0 }, - { X86::SHLX64rr, X86::SHLX64rm, 0 }, - { X86::T1MSKC32rr, X86::T1MSKC32rm, 0 }, - { X86::T1MSKC64rr, X86::T1MSKC64rm, 0 }, - { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, - { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, - { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, - { X86::TZMSK32rr, X86::TZMSK32rm, 0 }, - { X86::TZMSK64rr, X86::TZMSK64rm, 0 }, - - // AVX-512 foldable instructions - { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, - { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, - { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, - { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, - { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, - { X86::VMOVDI2SSZrr, X86::VMOVDI2SSZrm, 0 }, - { X86::VMOVAPDZrr, X86::VMOVAPDZrm, TB_ALIGN_64 }, - { X86::VMOVAPSZrr, X86::VMOVAPSZrm, TB_ALIGN_64 }, - { X86::VMOVDQA32Zrr, X86::VMOVDQA32Zrm, TB_ALIGN_64 }, - { X86::VMOVDQA64Zrr, X86::VMOVDQA64Zrm, TB_ALIGN_64 }, - { X86::VMOVDQU8Zrr, X86::VMOVDQU8Zrm, 0 }, - { X86::VMOVDQU16Zrr, X86::VMOVDQU16Zrm, 0 }, - { X86::VMOVDQU32Zrr, X86::VMOVDQU32Zrm, 0 }, - { X86::VMOVDQU64Zrr, X86::VMOVDQU64Zrm, 0 }, - { X86::VMOVUPDZrr, X86::VMOVUPDZrm, 0 }, - { X86::VMOVUPSZrr, X86::VMOVUPSZrm, 0 }, - { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm, TB_NO_REVERSE }, - { X86::VPABSBZrr, X86::VPABSBZrm, 0 }, - { X86::VPABSDZrr, X86::VPABSDZrm, 0 }, - { X86::VPABSQZrr, X86::VPABSQZrm, 0 }, - { X86::VPABSWZrr, X86::VPABSWZrm, 0 }, - { X86::VPERMILPDZri, X86::VPERMILPDZmi, 0 }, - { X86::VPERMILPSZri, X86::VPERMILPSZmi, 0 }, - { X86::VPERMPDZri, X86::VPERMPDZmi, 0 }, - { X86::VPERMQZri, X86::VPERMQZmi, 0 }, - { X86::VPMOVSXBDZrr, X86::VPMOVSXBDZrm, 0 }, - { X86::VPMOVSXBQZrr, X86::VPMOVSXBQZrm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrr, X86::VPMOVSXBWZrm, 0 }, - { X86::VPMOVSXDQZrr, X86::VPMOVSXDQZrm, 0 }, - { X86::VPMOVSXWDZrr, X86::VPMOVSXWDZrm, 0 }, - { X86::VPMOVSXWQZrr, X86::VPMOVSXWQZrm, 0 }, - { X86::VPMOVZXBDZrr, X86::VPMOVZXBDZrm, 0 }, - { X86::VPMOVZXBQZrr, X86::VPMOVZXBQZrm, TB_NO_REVERSE }, - { 
X86::VPMOVZXBWZrr, X86::VPMOVZXBWZrm, 0 }, - { X86::VPMOVZXDQZrr, X86::VPMOVZXDQZrm, 0 }, - { X86::VPMOVZXWDZrr, X86::VPMOVZXWDZrm, 0 }, - { X86::VPMOVZXWQZrr, X86::VPMOVZXWQZrm, 0 }, - { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, - { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, - { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, - { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 }, - { X86::VPSLLDZri, X86::VPSLLDZmi, 0 }, - { X86::VPSLLQZri, X86::VPSLLQZmi, 0 }, - { X86::VPSLLWZri, X86::VPSLLWZmi, 0 }, - { X86::VPSRADZri, X86::VPSRADZmi, 0 }, - { X86::VPSRAQZri, X86::VPSRAQZmi, 0 }, - { X86::VPSRAWZri, X86::VPSRAWZmi, 0 }, - { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 }, - { X86::VPSRLDZri, X86::VPSRLDZmi, 0 }, - { X86::VPSRLQZri, X86::VPSRLQZmi, 0 }, - { X86::VPSRLWZri, X86::VPSRLWZmi, 0 }, - - // AVX-512 foldable instructions (256-bit versions) - { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, - { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, - { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, - { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQA64Z256rr, X86::VMOVDQA64Z256rm, TB_ALIGN_32 }, - { X86::VMOVDQU8Z256rr, X86::VMOVDQU8Z256rm, 0 }, - { X86::VMOVDQU16Z256rr, X86::VMOVDQU16Z256rm, 0 }, - { X86::VMOVDQU32Z256rr, X86::VMOVDQU32Z256rm, 0 }, - { X86::VMOVDQU64Z256rr, X86::VMOVDQU64Z256rm, 0 }, - { X86::VMOVUPDZ256rr, X86::VMOVUPDZ256rm, 0 }, - { X86::VMOVUPSZ256rr, X86::VMOVUPSZ256rm, 0 }, - { X86::VPABSBZ256rr, X86::VPABSBZ256rm, 0 }, - { X86::VPABSDZ256rr, X86::VPABSDZ256rm, 0 }, - { X86::VPABSQZ256rr, X86::VPABSQZ256rm, 0 }, - { X86::VPABSWZ256rr, X86::VPABSWZ256rm, 0 }, - { X86::VPERMILPDZ256ri, X86::VPERMILPDZ256mi, 0 }, - { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256mi, 0 }, - { X86::VPERMPDZ256ri, X86::VPERMPDZ256mi, 0 }, - { X86::VPERMQZ256ri, X86::VPERMQZ256mi, 0 }, - { X86::VPMOVSXBDZ256rr, X86::VPMOVSXBDZ256rm, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rr, X86::VPMOVSXBQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rr, X86::VPMOVSXBWZ256rm, 0 }, - { X86::VPMOVSXDQZ256rr, X86::VPMOVSXDQZ256rm, 0 }, - { X86::VPMOVSXWDZ256rr, X86::VPMOVSXWDZ256rm, 0 }, - { X86::VPMOVSXWQZ256rr, X86::VPMOVSXWQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rr, X86::VPMOVZXBDZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ256rr, X86::VPMOVZXBQZ256rm, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rr, X86::VPMOVZXBWZ256rm, 0 }, - { X86::VPMOVZXDQZ256rr, X86::VPMOVZXDQZ256rm, 0 }, - { X86::VPMOVZXWDZ256rr, X86::VPMOVZXWDZ256rm, 0 }, - { X86::VPMOVZXWQZ256rr, X86::VPMOVZXWQZ256rm, TB_NO_REVERSE }, - { X86::VPSHUFDZ256ri, X86::VPSHUFDZ256mi, 0 }, - { X86::VPSHUFHWZ256ri, X86::VPSHUFHWZ256mi, 0 }, - { X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 }, - { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 }, - { X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 }, - { X86::VPSLLQZ256ri, X86::VPSLLQZ256mi, 0 }, - { X86::VPSLLWZ256ri, X86::VPSLLWZ256mi, 0 }, - { X86::VPSRADZ256ri, X86::VPSRADZ256mi, 0 }, - { X86::VPSRAQZ256ri, X86::VPSRAQZ256mi, 0 }, - { X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 }, - { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 }, - { X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 }, - { X86::VPSRLQZ256ri, X86::VPSRLQZ256mi, 0 }, - { X86::VPSRLWZ256ri, X86::VPSRLWZ256mi, 0 }, - - // AVX-512 foldable instructions (128-bit versions) - { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, - { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, - { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, - { 
X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQA64Z128rr, X86::VMOVDQA64Z128rm, TB_ALIGN_16 }, - { X86::VMOVDQU8Z128rr, X86::VMOVDQU8Z128rm, 0 }, - { X86::VMOVDQU16Z128rr, X86::VMOVDQU16Z128rm, 0 }, - { X86::VMOVDQU32Z128rr, X86::VMOVDQU32Z128rm, 0 }, - { X86::VMOVDQU64Z128rr, X86::VMOVDQU64Z128rm, 0 }, - { X86::VMOVUPDZ128rr, X86::VMOVUPDZ128rm, 0 }, - { X86::VMOVUPSZ128rr, X86::VMOVUPSZ128rm, 0 }, - { X86::VPABSBZ128rr, X86::VPABSBZ128rm, 0 }, - { X86::VPABSDZ128rr, X86::VPABSDZ128rm, 0 }, - { X86::VPABSQZ128rr, X86::VPABSQZ128rm, 0 }, - { X86::VPABSWZ128rr, X86::VPABSWZ128rm, 0 }, - { X86::VPERMILPDZ128ri, X86::VPERMILPDZ128mi, 0 }, - { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128mi, 0 }, - { X86::VPMOVSXBDZ128rr, X86::VPMOVSXBDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rr, X86::VPMOVSXBQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rr, X86::VPMOVSXBWZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rr, X86::VPMOVSXDQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rr, X86::VPMOVSXWDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rr, X86::VPMOVSXWQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rr, X86::VPMOVZXBDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rr, X86::VPMOVZXBQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rr, X86::VPMOVZXBWZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rr, X86::VPMOVZXDQZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rr, X86::VPMOVZXWDZ128rm, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rr, X86::VPMOVZXWQZ128rm, TB_NO_REVERSE }, - { X86::VPSHUFDZ128ri, X86::VPSHUFDZ128mi, 0 }, - { X86::VPSHUFHWZ128ri, X86::VPSHUFHWZ128mi, 0 }, - { X86::VPSHUFLWZ128ri, X86::VPSHUFLWZ128mi, 0 }, - { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 }, - { X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 }, - { X86::VPSLLQZ128ri, X86::VPSLLQZ128mi, 0 }, - { X86::VPSLLWZ128ri, X86::VPSLLWZ128mi, 0 }, - { X86::VPSRADZ128ri, X86::VPSRADZ128mi, 0 }, - { X86::VPSRAQZ128ri, X86::VPSRAQZ128mi, 0 }, - { X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 }, - { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 }, - { X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 }, - { X86::VPSRLQZ128ri, X86::VPSRLQZ128mi, 0 }, - { X86::VPSRLWZ128ri, X86::VPSRLWZ128mi, 0 }, - - // F16C foldable instructions - { X86::VCVTPH2PSrr, X86::VCVTPH2PSrm, 0 }, - { X86::VCVTPH2PSYrr, X86::VCVTPH2PSYrm, 0 }, - - // AES foldable instructions - { X86::AESIMCrr, X86::AESIMCrm, TB_ALIGN_16 }, - { X86::AESKEYGENASSIST128rr, X86::AESKEYGENASSIST128rm, TB_ALIGN_16 }, - { X86::VAESIMCrr, X86::VAESIMCrm, 0 }, - { X86::VAESKEYGENASSIST128rr, X86::VAESKEYGENASSIST128rm, 0 } - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable1) { AddTableEntry(RegOp2MemOpTable1, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, @@ -1040,1394 +143,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Entry.Flags | TB_INDEX_1 | TB_FOLDED_LOAD); } - static const X86MemoryFoldTableEntry MemoryFoldTable2[] = { - { X86::ADC32rr, X86::ADC32rm, 0 }, - { X86::ADC64rr, X86::ADC64rm, 0 }, - { X86::ADD16rr, X86::ADD16rm, 0 }, - { X86::ADD16rr_DB, X86::ADD16rm, TB_NO_REVERSE }, - { X86::ADD32rr, X86::ADD32rm, 0 }, - { X86::ADD32rr_DB, X86::ADD32rm, TB_NO_REVERSE }, - { X86::ADD64rr, X86::ADD64rm, 0 }, - { X86::ADD64rr_DB, X86::ADD64rm, TB_NO_REVERSE }, - { X86::ADD8rr, X86::ADD8rm, 0 }, - { X86::ADDPDrr, X86::ADDPDrm, TB_ALIGN_16 }, - { X86::ADDPSrr, X86::ADDPSrm, TB_ALIGN_16 }, - { X86::ADDSDrr, X86::ADDSDrm, 0 }, - { X86::ADDSDrr_Int, X86::ADDSDrm_Int, TB_NO_REVERSE }, - { X86::ADDSSrr, X86::ADDSSrm, 0 }, - { X86::ADDSSrr_Int, X86::ADDSSrm_Int, TB_NO_REVERSE }, - { 
X86::ADDSUBPDrr, X86::ADDSUBPDrm, TB_ALIGN_16 }, - { X86::ADDSUBPSrr, X86::ADDSUBPSrm, TB_ALIGN_16 }, - { X86::AND16rr, X86::AND16rm, 0 }, - { X86::AND32rr, X86::AND32rm, 0 }, - { X86::AND64rr, X86::AND64rm, 0 }, - { X86::AND8rr, X86::AND8rm, 0 }, - { X86::ANDNPDrr, X86::ANDNPDrm, TB_ALIGN_16 }, - { X86::ANDNPSrr, X86::ANDNPSrm, TB_ALIGN_16 }, - { X86::ANDPDrr, X86::ANDPDrm, TB_ALIGN_16 }, - { X86::ANDPSrr, X86::ANDPSrm, TB_ALIGN_16 }, - { X86::BLENDPDrri, X86::BLENDPDrmi, TB_ALIGN_16 }, - { X86::BLENDPSrri, X86::BLENDPSrmi, TB_ALIGN_16 }, - { X86::BLENDVPDrr0, X86::BLENDVPDrm0, TB_ALIGN_16 }, - { X86::BLENDVPSrr0, X86::BLENDVPSrm0, TB_ALIGN_16 }, - { X86::CMOVA16rr, X86::CMOVA16rm, 0 }, - { X86::CMOVA32rr, X86::CMOVA32rm, 0 }, - { X86::CMOVA64rr, X86::CMOVA64rm, 0 }, - { X86::CMOVAE16rr, X86::CMOVAE16rm, 0 }, - { X86::CMOVAE32rr, X86::CMOVAE32rm, 0 }, - { X86::CMOVAE64rr, X86::CMOVAE64rm, 0 }, - { X86::CMOVB16rr, X86::CMOVB16rm, 0 }, - { X86::CMOVB32rr, X86::CMOVB32rm, 0 }, - { X86::CMOVB64rr, X86::CMOVB64rm, 0 }, - { X86::CMOVBE16rr, X86::CMOVBE16rm, 0 }, - { X86::CMOVBE32rr, X86::CMOVBE32rm, 0 }, - { X86::CMOVBE64rr, X86::CMOVBE64rm, 0 }, - { X86::CMOVE16rr, X86::CMOVE16rm, 0 }, - { X86::CMOVE32rr, X86::CMOVE32rm, 0 }, - { X86::CMOVE64rr, X86::CMOVE64rm, 0 }, - { X86::CMOVG16rr, X86::CMOVG16rm, 0 }, - { X86::CMOVG32rr, X86::CMOVG32rm, 0 }, - { X86::CMOVG64rr, X86::CMOVG64rm, 0 }, - { X86::CMOVGE16rr, X86::CMOVGE16rm, 0 }, - { X86::CMOVGE32rr, X86::CMOVGE32rm, 0 }, - { X86::CMOVGE64rr, X86::CMOVGE64rm, 0 }, - { X86::CMOVL16rr, X86::CMOVL16rm, 0 }, - { X86::CMOVL32rr, X86::CMOVL32rm, 0 }, - { X86::CMOVL64rr, X86::CMOVL64rm, 0 }, - { X86::CMOVLE16rr, X86::CMOVLE16rm, 0 }, - { X86::CMOVLE32rr, X86::CMOVLE32rm, 0 }, - { X86::CMOVLE64rr, X86::CMOVLE64rm, 0 }, - { X86::CMOVNE16rr, X86::CMOVNE16rm, 0 }, - { X86::CMOVNE32rr, X86::CMOVNE32rm, 0 }, - { X86::CMOVNE64rr, X86::CMOVNE64rm, 0 }, - { X86::CMOVNO16rr, X86::CMOVNO16rm, 0 }, - { X86::CMOVNO32rr, X86::CMOVNO32rm, 0 }, - { X86::CMOVNO64rr, X86::CMOVNO64rm, 0 }, - { X86::CMOVNP16rr, X86::CMOVNP16rm, 0 }, - { X86::CMOVNP32rr, X86::CMOVNP32rm, 0 }, - { X86::CMOVNP64rr, X86::CMOVNP64rm, 0 }, - { X86::CMOVNS16rr, X86::CMOVNS16rm, 0 }, - { X86::CMOVNS32rr, X86::CMOVNS32rm, 0 }, - { X86::CMOVNS64rr, X86::CMOVNS64rm, 0 }, - { X86::CMOVO16rr, X86::CMOVO16rm, 0 }, - { X86::CMOVO32rr, X86::CMOVO32rm, 0 }, - { X86::CMOVO64rr, X86::CMOVO64rm, 0 }, - { X86::CMOVP16rr, X86::CMOVP16rm, 0 }, - { X86::CMOVP32rr, X86::CMOVP32rm, 0 }, - { X86::CMOVP64rr, X86::CMOVP64rm, 0 }, - { X86::CMOVS16rr, X86::CMOVS16rm, 0 }, - { X86::CMOVS32rr, X86::CMOVS32rm, 0 }, - { X86::CMOVS64rr, X86::CMOVS64rm, 0 }, - { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, - { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, - { X86::CMPSDrr, X86::CMPSDrm, 0 }, - { X86::CMPSSrr, X86::CMPSSrm, 0 }, - { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, - { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, - { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, - { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, - { X86::DIVSDrr, X86::DIVSDrm, 0 }, - { X86::DIVSDrr_Int, X86::DIVSDrm_Int, TB_NO_REVERSE }, - { X86::DIVSSrr, X86::DIVSSrm, 0 }, - { X86::DIVSSrr_Int, X86::DIVSSrm_Int, TB_NO_REVERSE }, - { X86::DPPDrri, X86::DPPDrmi, TB_ALIGN_16 }, - { X86::DPPSrri, X86::DPPSrmi, TB_ALIGN_16 }, - { X86::HADDPDrr, X86::HADDPDrm, TB_ALIGN_16 }, - { X86::HADDPSrr, X86::HADDPSrm, TB_ALIGN_16 }, - { X86::HSUBPDrr, X86::HSUBPDrm, TB_ALIGN_16 }, - { X86::HSUBPSrr, X86::HSUBPSrm, TB_ALIGN_16 }, - { X86::IMUL16rr, X86::IMUL16rm, 0 }, - { 
X86::IMUL32rr, X86::IMUL32rm, 0 }, - { X86::IMUL64rr, X86::IMUL64rm, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE }, - { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE }, - { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, - { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, - { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, - { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, - { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE }, - { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, - { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 }, - { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, - { X86::MAXCPSrr, X86::MAXCPSrm, TB_ALIGN_16 }, - { X86::MAXSDrr, X86::MAXSDrm, 0 }, - { X86::MAXCSDrr, X86::MAXCSDrm, 0 }, - { X86::MAXSDrr_Int, X86::MAXSDrm_Int, TB_NO_REVERSE }, - { X86::MAXSSrr, X86::MAXSSrm, 0 }, - { X86::MAXCSSrr, X86::MAXCSSrm, 0 }, - { X86::MAXSSrr_Int, X86::MAXSSrm_Int, TB_NO_REVERSE }, - { X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 }, - { X86::MINCPDrr, X86::MINCPDrm, TB_ALIGN_16 }, - { X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 }, - { X86::MINCPSrr, X86::MINCPSrm, TB_ALIGN_16 }, - { X86::MINSDrr, X86::MINSDrm, 0 }, - { X86::MINCSDrr, X86::MINCSDrm, 0 }, - { X86::MINSDrr_Int, X86::MINSDrm_Int, TB_NO_REVERSE }, - { X86::MINSSrr, X86::MINSSrm, 0 }, - { X86::MINCSSrr, X86::MINCSSrm, 0 }, - { X86::MINSSrr_Int, X86::MINSSrm_Int, TB_NO_REVERSE }, - { X86::MOVLHPSrr, X86::MOVHPSrm, TB_NO_REVERSE }, - { X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 }, - { X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 }, - { X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 }, - { X86::MULSDrr, X86::MULSDrm, 0 }, - { X86::MULSDrr_Int, X86::MULSDrm_Int, TB_NO_REVERSE }, - { X86::MULSSrr, X86::MULSSrm, 0 }, - { X86::MULSSrr_Int, X86::MULSSrm_Int, TB_NO_REVERSE }, - { X86::OR16rr, X86::OR16rm, 0 }, - { X86::OR32rr, X86::OR32rm, 0 }, - { X86::OR64rr, X86::OR64rm, 0 }, - { X86::OR8rr, X86::OR8rm, 0 }, - { X86::ORPDrr, X86::ORPDrm, TB_ALIGN_16 }, - { X86::ORPSrr, X86::ORPSrm, TB_ALIGN_16 }, - { X86::PACKSSDWrr, X86::PACKSSDWrm, TB_ALIGN_16 }, - { X86::PACKSSWBrr, X86::PACKSSWBrm, TB_ALIGN_16 }, - { X86::PACKUSDWrr, X86::PACKUSDWrm, TB_ALIGN_16 }, - { X86::PACKUSWBrr, X86::PACKUSWBrm, TB_ALIGN_16 }, - { X86::PADDBrr, X86::PADDBrm, TB_ALIGN_16 }, - { X86::PADDDrr, X86::PADDDrm, TB_ALIGN_16 }, - { X86::PADDQrr, X86::PADDQrm, TB_ALIGN_16 }, - { X86::PADDSBrr, X86::PADDSBrm, TB_ALIGN_16 }, - { X86::PADDSWrr, X86::PADDSWrm, TB_ALIGN_16 }, - { X86::PADDUSBrr, X86::PADDUSBrm, TB_ALIGN_16 }, - { X86::PADDUSWrr, X86::PADDUSWrm, TB_ALIGN_16 }, - { X86::PADDWrr, X86::PADDWrm, TB_ALIGN_16 }, - { X86::PALIGNRrri, X86::PALIGNRrmi, TB_ALIGN_16 }, - { X86::PANDNrr, X86::PANDNrm, TB_ALIGN_16 }, - { X86::PANDrr, X86::PANDrm, TB_ALIGN_16 }, - { X86::PAVGBrr, X86::PAVGBrm, TB_ALIGN_16 }, - { X86::PAVGWrr, X86::PAVGWrm, TB_ALIGN_16 }, - { X86::PBLENDVBrr0, X86::PBLENDVBrm0, TB_ALIGN_16 }, - { X86::PBLENDWrri, X86::PBLENDWrmi, TB_ALIGN_16 }, - { X86::PCLMULQDQrr, X86::PCLMULQDQrm, TB_ALIGN_16 }, - { X86::PCMPEQBrr, X86::PCMPEQBrm, TB_ALIGN_16 }, - { X86::PCMPEQDrr, X86::PCMPEQDrm, TB_ALIGN_16 }, - { X86::PCMPEQQrr, X86::PCMPEQQrm, TB_ALIGN_16 }, - { X86::PCMPEQWrr, X86::PCMPEQWrm, TB_ALIGN_16 }, - { X86::PCMPGTBrr, X86::PCMPGTBrm, TB_ALIGN_16 }, - { X86::PCMPGTDrr, X86::PCMPGTDrm, TB_ALIGN_16 }, - { X86::PCMPGTQrr, X86::PCMPGTQrm, TB_ALIGN_16 }, - { X86::PCMPGTWrr, X86::PCMPGTWrm, TB_ALIGN_16 }, - { X86::PHADDDrr, X86::PHADDDrm, TB_ALIGN_16 }, - { X86::PHADDWrr, X86::PHADDWrm, 
TB_ALIGN_16 }, - { X86::PHADDSWrr128, X86::PHADDSWrm128, TB_ALIGN_16 }, - { X86::PHSUBDrr, X86::PHSUBDrm, TB_ALIGN_16 }, - { X86::PHSUBSWrr128, X86::PHSUBSWrm128, TB_ALIGN_16 }, - { X86::PHSUBWrr, X86::PHSUBWrm, TB_ALIGN_16 }, - { X86::PINSRBrr, X86::PINSRBrm, 0 }, - { X86::PINSRDrr, X86::PINSRDrm, 0 }, - { X86::PINSRQrr, X86::PINSRQrm, 0 }, - { X86::PINSRWrri, X86::PINSRWrmi, 0 }, - { X86::PMADDUBSWrr, X86::PMADDUBSWrm, TB_ALIGN_16 }, - { X86::PMADDWDrr, X86::PMADDWDrm, TB_ALIGN_16 }, - { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 }, - { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 }, - { X86::PMAXSWrr, X86::PMAXSWrm, TB_ALIGN_16 }, - { X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 }, - { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 }, - { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 }, - { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 }, - { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 }, - { X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 }, - { X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 }, - { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 }, - { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 }, - { X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 }, - { X86::PMULHRSWrr, X86::PMULHRSWrm, TB_ALIGN_16 }, - { X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 }, - { X86::PMULHWrr, X86::PMULHWrm, TB_ALIGN_16 }, - { X86::PMULLDrr, X86::PMULLDrm, TB_ALIGN_16 }, - { X86::PMULLWrr, X86::PMULLWrm, TB_ALIGN_16 }, - { X86::PMULUDQrr, X86::PMULUDQrm, TB_ALIGN_16 }, - { X86::PORrr, X86::PORrm, TB_ALIGN_16 }, - { X86::PSADBWrr, X86::PSADBWrm, TB_ALIGN_16 }, - { X86::PSHUFBrr, X86::PSHUFBrm, TB_ALIGN_16 }, - { X86::PSIGNBrr128, X86::PSIGNBrm128, TB_ALIGN_16 }, - { X86::PSIGNWrr128, X86::PSIGNWrm128, TB_ALIGN_16 }, - { X86::PSIGNDrr128, X86::PSIGNDrm128, TB_ALIGN_16 }, - { X86::PSLLDrr, X86::PSLLDrm, TB_ALIGN_16 }, - { X86::PSLLQrr, X86::PSLLQrm, TB_ALIGN_16 }, - { X86::PSLLWrr, X86::PSLLWrm, TB_ALIGN_16 }, - { X86::PSRADrr, X86::PSRADrm, TB_ALIGN_16 }, - { X86::PSRAWrr, X86::PSRAWrm, TB_ALIGN_16 }, - { X86::PSRLDrr, X86::PSRLDrm, TB_ALIGN_16 }, - { X86::PSRLQrr, X86::PSRLQrm, TB_ALIGN_16 }, - { X86::PSRLWrr, X86::PSRLWrm, TB_ALIGN_16 }, - { X86::PSUBBrr, X86::PSUBBrm, TB_ALIGN_16 }, - { X86::PSUBDrr, X86::PSUBDrm, TB_ALIGN_16 }, - { X86::PSUBQrr, X86::PSUBQrm, TB_ALIGN_16 }, - { X86::PSUBSBrr, X86::PSUBSBrm, TB_ALIGN_16 }, - { X86::PSUBSWrr, X86::PSUBSWrm, TB_ALIGN_16 }, - { X86::PSUBUSBrr, X86::PSUBUSBrm, TB_ALIGN_16 }, - { X86::PSUBUSWrr, X86::PSUBUSWrm, TB_ALIGN_16 }, - { X86::PSUBWrr, X86::PSUBWrm, TB_ALIGN_16 }, - { X86::PUNPCKHBWrr, X86::PUNPCKHBWrm, TB_ALIGN_16 }, - { X86::PUNPCKHDQrr, X86::PUNPCKHDQrm, TB_ALIGN_16 }, - { X86::PUNPCKHQDQrr, X86::PUNPCKHQDQrm, TB_ALIGN_16 }, - { X86::PUNPCKHWDrr, X86::PUNPCKHWDrm, TB_ALIGN_16 }, - { X86::PUNPCKLBWrr, X86::PUNPCKLBWrm, TB_ALIGN_16 }, - { X86::PUNPCKLDQrr, X86::PUNPCKLDQrm, TB_ALIGN_16 }, - { X86::PUNPCKLQDQrr, X86::PUNPCKLQDQrm, TB_ALIGN_16 }, - { X86::PUNPCKLWDrr, X86::PUNPCKLWDrm, TB_ALIGN_16 }, - { X86::PXORrr, X86::PXORrm, TB_ALIGN_16 }, - { X86::ROUNDSDr_Int, X86::ROUNDSDm_Int, TB_NO_REVERSE }, - { X86::ROUNDSSr_Int, X86::ROUNDSSm_Int, TB_NO_REVERSE }, - { X86::SBB32rr, X86::SBB32rm, 0 }, - { X86::SBB64rr, X86::SBB64rm, 0 }, - { X86::SHUFPDrri, X86::SHUFPDrmi, TB_ALIGN_16 }, - { X86::SHUFPSrri, X86::SHUFPSrmi, TB_ALIGN_16 }, - { X86::SUB16rr, X86::SUB16rm, 0 }, - { X86::SUB32rr, X86::SUB32rm, 0 }, - { X86::SUB64rr, X86::SUB64rm, 0 }, - { X86::SUB8rr, X86::SUB8rm, 0 }, - { X86::SUBPDrr, X86::SUBPDrm, TB_ALIGN_16 }, - { X86::SUBPSrr, X86::SUBPSrm, TB_ALIGN_16 }, - { X86::SUBSDrr, X86::SUBSDrm, 
0 }, - { X86::SUBSDrr_Int, X86::SUBSDrm_Int, TB_NO_REVERSE }, - { X86::SUBSSrr, X86::SUBSSrm, 0 }, - { X86::SUBSSrr_Int, X86::SUBSSrm_Int, TB_NO_REVERSE }, - // FIXME: TEST*rr -> swapped operand of TEST*mr. - { X86::UNPCKHPDrr, X86::UNPCKHPDrm, TB_ALIGN_16 }, - { X86::UNPCKHPSrr, X86::UNPCKHPSrm, TB_ALIGN_16 }, - { X86::UNPCKLPDrr, X86::UNPCKLPDrm, TB_ALIGN_16 }, - { X86::UNPCKLPSrr, X86::UNPCKLPSrm, TB_ALIGN_16 }, - { X86::XOR16rr, X86::XOR16rm, 0 }, - { X86::XOR32rr, X86::XOR32rm, 0 }, - { X86::XOR64rr, X86::XOR64rm, 0 }, - { X86::XOR8rr, X86::XOR8rm, 0 }, - { X86::XORPDrr, X86::XORPDrm, TB_ALIGN_16 }, - { X86::XORPSrr, X86::XORPSrm, TB_ALIGN_16 }, - - // MMX version of foldable instructions - { X86::MMX_CVTPI2PSirr, X86::MMX_CVTPI2PSirm, 0 }, - { X86::MMX_PACKSSDWirr, X86::MMX_PACKSSDWirm, 0 }, - { X86::MMX_PACKSSWBirr, X86::MMX_PACKSSWBirm, 0 }, - { X86::MMX_PACKUSWBirr, X86::MMX_PACKUSWBirm, 0 }, - { X86::MMX_PADDBirr, X86::MMX_PADDBirm, 0 }, - { X86::MMX_PADDDirr, X86::MMX_PADDDirm, 0 }, - { X86::MMX_PADDQirr, X86::MMX_PADDQirm, 0 }, - { X86::MMX_PADDSBirr, X86::MMX_PADDSBirm, 0 }, - { X86::MMX_PADDSWirr, X86::MMX_PADDSWirm, 0 }, - { X86::MMX_PADDUSBirr, X86::MMX_PADDUSBirm, 0 }, - { X86::MMX_PADDUSWirr, X86::MMX_PADDUSWirm, 0 }, - { X86::MMX_PADDWirr, X86::MMX_PADDWirm, 0 }, - { X86::MMX_PALIGNR64irr, X86::MMX_PALIGNR64irm, 0 }, - { X86::MMX_PANDNirr, X86::MMX_PANDNirm, 0 }, - { X86::MMX_PANDirr, X86::MMX_PANDirm, 0 }, - { X86::MMX_PAVGBirr, X86::MMX_PAVGBirm, 0 }, - { X86::MMX_PAVGWirr, X86::MMX_PAVGWirm, 0 }, - { X86::MMX_PCMPEQBirr, X86::MMX_PCMPEQBirm, 0 }, - { X86::MMX_PCMPEQDirr, X86::MMX_PCMPEQDirm, 0 }, - { X86::MMX_PCMPEQWirr, X86::MMX_PCMPEQWirm, 0 }, - { X86::MMX_PCMPGTBirr, X86::MMX_PCMPGTBirm, 0 }, - { X86::MMX_PCMPGTDirr, X86::MMX_PCMPGTDirm, 0 }, - { X86::MMX_PCMPGTWirr, X86::MMX_PCMPGTWirm, 0 }, - { X86::MMX_PHADDSWrr64, X86::MMX_PHADDSWrm64, 0 }, - { X86::MMX_PHADDWrr64, X86::MMX_PHADDWrm64, 0 }, - { X86::MMX_PHADDrr64, X86::MMX_PHADDrm64, 0 }, - { X86::MMX_PHSUBDrr64, X86::MMX_PHSUBDrm64, 0 }, - { X86::MMX_PHSUBSWrr64, X86::MMX_PHSUBSWrm64, 0 }, - { X86::MMX_PHSUBWrr64, X86::MMX_PHSUBWrm64, 0 }, - { X86::MMX_PINSRWirri, X86::MMX_PINSRWirmi, 0 }, - { X86::MMX_PMADDUBSWrr64, X86::MMX_PMADDUBSWrm64, 0 }, - { X86::MMX_PMADDWDirr, X86::MMX_PMADDWDirm, 0 }, - { X86::MMX_PMAXSWirr, X86::MMX_PMAXSWirm, 0 }, - { X86::MMX_PMAXUBirr, X86::MMX_PMAXUBirm, 0 }, - { X86::MMX_PMINSWirr, X86::MMX_PMINSWirm, 0 }, - { X86::MMX_PMINUBirr, X86::MMX_PMINUBirm, 0 }, - { X86::MMX_PMULHRSWrr64, X86::MMX_PMULHRSWrm64, 0 }, - { X86::MMX_PMULHUWirr, X86::MMX_PMULHUWirm, 0 }, - { X86::MMX_PMULHWirr, X86::MMX_PMULHWirm, 0 }, - { X86::MMX_PMULLWirr, X86::MMX_PMULLWirm, 0 }, - { X86::MMX_PMULUDQirr, X86::MMX_PMULUDQirm, 0 }, - { X86::MMX_PORirr, X86::MMX_PORirm, 0 }, - { X86::MMX_PSADBWirr, X86::MMX_PSADBWirm, 0 }, - { X86::MMX_PSHUFBrr64, X86::MMX_PSHUFBrm64, 0 }, - { X86::MMX_PSIGNBrr64, X86::MMX_PSIGNBrm64, 0 }, - { X86::MMX_PSIGNDrr64, X86::MMX_PSIGNDrm64, 0 }, - { X86::MMX_PSIGNWrr64, X86::MMX_PSIGNWrm64, 0 }, - { X86::MMX_PSLLDrr, X86::MMX_PSLLDrm, 0 }, - { X86::MMX_PSLLQrr, X86::MMX_PSLLQrm, 0 }, - { X86::MMX_PSLLWrr, X86::MMX_PSLLWrm, 0 }, - { X86::MMX_PSRADrr, X86::MMX_PSRADrm, 0 }, - { X86::MMX_PSRAWrr, X86::MMX_PSRAWrm, 0 }, - { X86::MMX_PSRLDrr, X86::MMX_PSRLDrm, 0 }, - { X86::MMX_PSRLQrr, X86::MMX_PSRLQrm, 0 }, - { X86::MMX_PSRLWrr, X86::MMX_PSRLWrm, 0 }, - { X86::MMX_PSUBBirr, X86::MMX_PSUBBirm, 0 }, - { X86::MMX_PSUBDirr, X86::MMX_PSUBDirm, 0 }, - { X86::MMX_PSUBQirr, 
X86::MMX_PSUBQirm, 0 }, - { X86::MMX_PSUBSBirr, X86::MMX_PSUBSBirm, 0 }, - { X86::MMX_PSUBSWirr, X86::MMX_PSUBSWirm, 0 }, - { X86::MMX_PSUBUSBirr, X86::MMX_PSUBUSBirm, 0 }, - { X86::MMX_PSUBUSWirr, X86::MMX_PSUBUSWirm, 0 }, - { X86::MMX_PSUBWirr, X86::MMX_PSUBWirm, 0 }, - { X86::MMX_PUNPCKHBWirr, X86::MMX_PUNPCKHBWirm, 0 }, - { X86::MMX_PUNPCKHDQirr, X86::MMX_PUNPCKHDQirm, 0 }, - { X86::MMX_PUNPCKHWDirr, X86::MMX_PUNPCKHWDirm, 0 }, - { X86::MMX_PUNPCKLBWirr, X86::MMX_PUNPCKLBWirm, 0 }, - { X86::MMX_PUNPCKLDQirr, X86::MMX_PUNPCKLDQirm, 0 }, - { X86::MMX_PUNPCKLWDirr, X86::MMX_PUNPCKLWDirm, 0 }, - { X86::MMX_PXORirr, X86::MMX_PXORirm, 0 }, - - // 3DNow! version of foldable instructions - { X86::PAVGUSBrr, X86::PAVGUSBrm, 0 }, - { X86::PFACCrr, X86::PFACCrm, 0 }, - { X86::PFADDrr, X86::PFADDrm, 0 }, - { X86::PFCMPEQrr, X86::PFCMPEQrm, 0 }, - { X86::PFCMPGErr, X86::PFCMPGErm, 0 }, - { X86::PFCMPGTrr, X86::PFCMPGTrm, 0 }, - { X86::PFMAXrr, X86::PFMAXrm, 0 }, - { X86::PFMINrr, X86::PFMINrm, 0 }, - { X86::PFMULrr, X86::PFMULrm, 0 }, - { X86::PFNACCrr, X86::PFNACCrm, 0 }, - { X86::PFPNACCrr, X86::PFPNACCrm, 0 }, - { X86::PFRCPIT1rr, X86::PFRCPIT1rm, 0 }, - { X86::PFRCPIT2rr, X86::PFRCPIT2rm, 0 }, - { X86::PFRSQIT1rr, X86::PFRSQIT1rm, 0 }, - { X86::PFSUBrr, X86::PFSUBrm, 0 }, - { X86::PFSUBRrr, X86::PFSUBRrm, 0 }, - { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, - - // AVX 128-bit versions of foldable instructions - { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, - { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, - { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, - { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, - { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, - { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, - { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, - { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, - { X86::VADDPDrr, X86::VADDPDrm, 0 }, - { X86::VADDPSrr, X86::VADDPSrm, 0 }, - { X86::VADDSDrr, X86::VADDSDrm, 0 }, - { X86::VADDSDrr_Int, X86::VADDSDrm_Int, TB_NO_REVERSE }, - { X86::VADDSSrr, X86::VADDSSrm, 0 }, - { X86::VADDSSrr_Int, X86::VADDSSrm_Int, TB_NO_REVERSE }, - { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 }, - { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 }, - { X86::VANDNPDrr, X86::VANDNPDrm, 0 }, - { X86::VANDNPSrr, X86::VANDNPSrm, 0 }, - { X86::VANDPDrr, X86::VANDPDrm, 0 }, - { X86::VANDPSrr, X86::VANDPSrm, 0 }, - { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 }, - { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 }, - { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 }, - { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 }, - { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, - { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, - { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, - { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, - { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, - { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, - { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, - { X86::VDIVSDrr_Int, X86::VDIVSDrm_Int, TB_NO_REVERSE }, - { X86::VDIVSSrr, X86::VDIVSSrm, 0 }, - { X86::VDIVSSrr_Int, X86::VDIVSSrm_Int, TB_NO_REVERSE }, - { X86::VDPPDrri, X86::VDPPDrmi, 0 }, - { X86::VDPPSrri, X86::VDPPSrmi, 0 }, - { X86::VHADDPDrr, X86::VHADDPDrm, 0 }, - { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, - { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, - { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, - { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE }, - { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE }, - { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, - { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, - { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, - { X86::VMAXCSSrr, X86::VMAXCSSrm, 0 }, - { X86::VMAXPDrr, X86::VMAXPDrm, 0 }, - { X86::VMAXPSrr, 
X86::VMAXPSrm, 0 }, - { X86::VMAXSDrr, X86::VMAXSDrm, 0 }, - { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, TB_NO_REVERSE }, - { X86::VMAXSSrr, X86::VMAXSSrm, 0 }, - { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, TB_NO_REVERSE }, - { X86::VMINCPDrr, X86::VMINCPDrm, 0 }, - { X86::VMINCPSrr, X86::VMINCPSrm, 0 }, - { X86::VMINCSDrr, X86::VMINCSDrm, 0 }, - { X86::VMINCSSrr, X86::VMINCSSrm, 0 }, - { X86::VMINPDrr, X86::VMINPDrm, 0 }, - { X86::VMINPSrr, X86::VMINPSrm, 0 }, - { X86::VMINSDrr, X86::VMINSDrm, 0 }, - { X86::VMINSDrr_Int, X86::VMINSDrm_Int, TB_NO_REVERSE }, - { X86::VMINSSrr, X86::VMINSSrm, 0 }, - { X86::VMINSSrr_Int, X86::VMINSSrm_Int, TB_NO_REVERSE }, - { X86::VMOVLHPSrr, X86::VMOVHPSrm, TB_NO_REVERSE }, - { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 }, - { X86::VMULPDrr, X86::VMULPDrm, 0 }, - { X86::VMULPSrr, X86::VMULPSrm, 0 }, - { X86::VMULSDrr, X86::VMULSDrm, 0 }, - { X86::VMULSDrr_Int, X86::VMULSDrm_Int, TB_NO_REVERSE }, - { X86::VMULSSrr, X86::VMULSSrm, 0 }, - { X86::VMULSSrr_Int, X86::VMULSSrm_Int, TB_NO_REVERSE }, - { X86::VORPDrr, X86::VORPDrm, 0 }, - { X86::VORPSrr, X86::VORPSrm, 0 }, - { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 }, - { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 }, - { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 }, - { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 }, - { X86::VPADDBrr, X86::VPADDBrm, 0 }, - { X86::VPADDDrr, X86::VPADDDrm, 0 }, - { X86::VPADDQrr, X86::VPADDQrm, 0 }, - { X86::VPADDSBrr, X86::VPADDSBrm, 0 }, - { X86::VPADDSWrr, X86::VPADDSWrm, 0 }, - { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 }, - { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 }, - { X86::VPADDWrr, X86::VPADDWrm, 0 }, - { X86::VPALIGNRrri, X86::VPALIGNRrmi, 0 }, - { X86::VPANDNrr, X86::VPANDNrm, 0 }, - { X86::VPANDrr, X86::VPANDrm, 0 }, - { X86::VPAVGBrr, X86::VPAVGBrm, 0 }, - { X86::VPAVGWrr, X86::VPAVGWrm, 0 }, - { X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 }, - { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 }, - { X86::VPCLMULQDQrr, X86::VPCLMULQDQrm, 0 }, - { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 }, - { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 }, - { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 }, - { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 }, - { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 }, - { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 }, - { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 }, - { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 }, - { X86::VPHADDDrr, X86::VPHADDDrm, 0 }, - { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 }, - { X86::VPHADDWrr, X86::VPHADDWrm, 0 }, - { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 }, - { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 }, - { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 }, - { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 }, - { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 }, - { X86::VPINSRBrr, X86::VPINSRBrm, 0 }, - { X86::VPINSRDrr, X86::VPINSRDrm, 0 }, - { X86::VPINSRQrr, X86::VPINSRQrm, 0 }, - { X86::VPINSRWrri, X86::VPINSRWrmi, 0 }, - { X86::VPMADDUBSWrr, X86::VPMADDUBSWrm, 0 }, - { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 }, - { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 }, - { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 }, - { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 }, - { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 }, - { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 }, - { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 }, - { X86::VPMINSBrr, X86::VPMINSBrm, 0 }, - { X86::VPMINSDrr, X86::VPMINSDrm, 0 }, - { X86::VPMINSWrr, X86::VPMINSWrm, 0 }, - { X86::VPMINUBrr, X86::VPMINUBrm, 0 }, - { X86::VPMINUDrr, X86::VPMINUDrm, 0 }, - { X86::VPMINUWrr, X86::VPMINUWrm, 0 }, - { X86::VPMULDQrr, X86::VPMULDQrm, 0 }, - { X86::VPMULHRSWrr, X86::VPMULHRSWrm, 0 }, - { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 }, - { X86::VPMULHWrr, X86::VPMULHWrm, 0 }, 
- { X86::VPMULLDrr, X86::VPMULLDrm, 0 }, - { X86::VPMULLWrr, X86::VPMULLWrm, 0 }, - { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 }, - { X86::VPORrr, X86::VPORrm, 0 }, - { X86::VPSADBWrr, X86::VPSADBWrm, 0 }, - { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 }, - { X86::VPSIGNBrr128, X86::VPSIGNBrm128, 0 }, - { X86::VPSIGNWrr128, X86::VPSIGNWrm128, 0 }, - { X86::VPSIGNDrr128, X86::VPSIGNDrm128, 0 }, - { X86::VPSLLDrr, X86::VPSLLDrm, 0 }, - { X86::VPSLLQrr, X86::VPSLLQrm, 0 }, - { X86::VPSLLWrr, X86::VPSLLWrm, 0 }, - { X86::VPSRADrr, X86::VPSRADrm, 0 }, - { X86::VPSRAWrr, X86::VPSRAWrm, 0 }, - { X86::VPSRLDrr, X86::VPSRLDrm, 0 }, - { X86::VPSRLQrr, X86::VPSRLQrm, 0 }, - { X86::VPSRLWrr, X86::VPSRLWrm, 0 }, - { X86::VPSUBBrr, X86::VPSUBBrm, 0 }, - { X86::VPSUBDrr, X86::VPSUBDrm, 0 }, - { X86::VPSUBQrr, X86::VPSUBQrm, 0 }, - { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 }, - { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 }, - { X86::VPSUBUSBrr, X86::VPSUBUSBrm, 0 }, - { X86::VPSUBUSWrr, X86::VPSUBUSWrm, 0 }, - { X86::VPSUBWrr, X86::VPSUBWrm, 0 }, - { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 }, - { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 }, - { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 }, - { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 }, - { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 }, - { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 }, - { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 }, - { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 }, - { X86::VPXORrr, X86::VPXORrm, 0 }, - { X86::VRCPSSr, X86::VRCPSSm, 0 }, - { X86::VRCPSSr_Int, X86::VRCPSSm_Int, TB_NO_REVERSE }, - { X86::VRSQRTSSr, X86::VRSQRTSSm, 0 }, - { X86::VRSQRTSSr_Int, X86::VRSQRTSSm_Int, TB_NO_REVERSE }, - { X86::VROUNDSDr, X86::VROUNDSDm, 0 }, - { X86::VROUNDSDr_Int, X86::VROUNDSDm_Int, TB_NO_REVERSE }, - { X86::VROUNDSSr, X86::VROUNDSSm, 0 }, - { X86::VROUNDSSr_Int, X86::VROUNDSSm_Int, TB_NO_REVERSE }, - { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 }, - { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 }, - { X86::VSQRTSDr, X86::VSQRTSDm, 0 }, - { X86::VSQRTSDr_Int, X86::VSQRTSDm_Int, TB_NO_REVERSE }, - { X86::VSQRTSSr, X86::VSQRTSSm, 0 }, - { X86::VSQRTSSr_Int, X86::VSQRTSSm_Int, TB_NO_REVERSE }, - { X86::VSUBPDrr, X86::VSUBPDrm, 0 }, - { X86::VSUBPSrr, X86::VSUBPSrm, 0 }, - { X86::VSUBSDrr, X86::VSUBSDrm, 0 }, - { X86::VSUBSDrr_Int, X86::VSUBSDrm_Int, TB_NO_REVERSE }, - { X86::VSUBSSrr, X86::VSUBSSrm, 0 }, - { X86::VSUBSSrr_Int, X86::VSUBSSrm_Int, TB_NO_REVERSE }, - { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 }, - { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 }, - { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 }, - { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 }, - { X86::VXORPDrr, X86::VXORPDrm, 0 }, - { X86::VXORPSrr, X86::VXORPSrm, 0 }, - - // AVX 256-bit foldable instructions - { X86::VADDPDYrr, X86::VADDPDYrm, 0 }, - { X86::VADDPSYrr, X86::VADDPSYrm, 0 }, - { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 }, - { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 }, - { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 }, - { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 }, - { X86::VANDPDYrr, X86::VANDPDYrm, 0 }, - { X86::VANDPSYrr, X86::VANDPSYrm, 0 }, - { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 }, - { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 }, - { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 }, - { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 }, - { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 }, - { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 }, - { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 }, - { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 }, - { X86::VDPPSYrri, X86::VDPPSYrmi, 0 }, - { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 }, - { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 }, - { X86::VHSUBPDYrr, 
X86::VHSUBPDYrm, 0 }, - { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 }, - { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 }, - { X86::VMAXCPDYrr, X86::VMAXCPDYrm, 0 }, - { X86::VMAXCPSYrr, X86::VMAXCPSYrm, 0 }, - { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 }, - { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 }, - { X86::VMINCPDYrr, X86::VMINCPDYrm, 0 }, - { X86::VMINCPSYrr, X86::VMINCPSYrm, 0 }, - { X86::VMINPDYrr, X86::VMINPDYrm, 0 }, - { X86::VMINPSYrr, X86::VMINPSYrm, 0 }, - { X86::VMULPDYrr, X86::VMULPDYrm, 0 }, - { X86::VMULPSYrr, X86::VMULPSYrm, 0 }, - { X86::VORPDYrr, X86::VORPDYrm, 0 }, - { X86::VORPSYrr, X86::VORPSYrm, 0 }, - { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 }, - { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 }, - { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 }, - { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 }, - { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 }, - { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 }, - { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 }, - { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 }, - { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 }, - { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 }, - { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 }, - { X86::VXORPDYrr, X86::VXORPDYrm, 0 }, - { X86::VXORPSYrr, X86::VXORPSYrm, 0 }, - - // AVX2 foldable instructions - { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 }, - { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 }, - { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 }, - { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 }, - { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 }, - { X86::VPADDBYrr, X86::VPADDBYrm, 0 }, - { X86::VPADDDYrr, X86::VPADDDYrm, 0 }, - { X86::VPADDQYrr, X86::VPADDQYrm, 0 }, - { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 }, - { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 }, - { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 }, - { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 }, - { X86::VPADDWYrr, X86::VPADDWYrm, 0 }, - { X86::VPALIGNRYrri, X86::VPALIGNRYrmi, 0 }, - { X86::VPANDNYrr, X86::VPANDNYrm, 0 }, - { X86::VPANDYrr, X86::VPANDYrm, 0 }, - { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 }, - { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 }, - { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 }, - { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 }, - { X86::VPBLENDVBYrr, X86::VPBLENDVBYrm, 0 }, - { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 }, - { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 }, - { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 }, - { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 }, - { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 }, - { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 }, - { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 }, - { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 }, - { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 }, - { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 }, - { X86::VPERMDYrr, X86::VPERMDYrm, 0 }, - { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 }, - { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 }, - { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 }, - { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 }, - { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 }, - { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 }, - { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 }, - { X86::VPMADDUBSWYrr, X86::VPMADDUBSWYrm, 0 }, - { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 }, - { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 }, - { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 }, - { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 }, - { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 }, - { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 }, - { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 }, - { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 }, - { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 }, - { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 }, - { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 }, - { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 }, - { X86::VPMINUWYrr, 
X86::VPMINUWYrm, 0 }, - { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 }, - { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 }, - { X86::VPMULHRSWYrr, X86::VPMULHRSWYrm, 0 }, - { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 }, - { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 }, - { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 }, - { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 }, - { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 }, - { X86::VPORYrr, X86::VPORYrm, 0 }, - { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 }, - { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 }, - { X86::VPSIGNBYrr256, X86::VPSIGNBYrm256, 0 }, - { X86::VPSIGNWYrr256, X86::VPSIGNWYrm256, 0 }, - { X86::VPSIGNDYrr256, X86::VPSIGNDYrm256, 0 }, - { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 }, - { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 }, - { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 }, - { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 }, - { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 }, - { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 }, - { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 }, - { X86::VPSRADYrr, X86::VPSRADYrm, 0 }, - { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 }, - { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 }, - { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 }, - { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 }, - { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 }, - { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 }, - { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 }, - { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 }, - { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 }, - { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 }, - { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 }, - { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 }, - { X86::VPSUBQYrr, X86::VPSUBQYrm, 0 }, - { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 }, - { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 }, - { X86::VPSUBUSBYrr, X86::VPSUBUSBYrm, 0 }, - { X86::VPSUBUSWYrr, X86::VPSUBUSWYrm, 0 }, - { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 }, - { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 }, - { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 }, - { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 }, - { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 }, - { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 }, - { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 }, - { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 }, - { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 }, - { X86::VPXORYrr, X86::VPXORYrm, 0 }, - - // FMA4 foldable patterns - { X86::VFMADDSS4rr, X86::VFMADDSS4mr, TB_ALIGN_NONE }, - { X86::VFMADDSS4rr_Int, X86::VFMADDSS4mr_Int, TB_NO_REVERSE }, - { X86::VFMADDSD4rr, X86::VFMADDSD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSD4rr_Int, X86::VFMADDSD4mr_Int, TB_NO_REVERSE }, - { X86::VFMADDPS4rr, X86::VFMADDPS4mr, TB_ALIGN_NONE }, - { X86::VFMADDPD4rr, X86::VFMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDPS4Yrr, X86::VFMADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDPD4Yrr, X86::VFMADDPD4Ymr, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr, X86::VFNMADDSS4mr, TB_ALIGN_NONE }, - { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4mr_Int, TB_NO_REVERSE }, - { X86::VFNMADDSD4rr, X86::VFNMADDSD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4mr_Int, TB_NO_REVERSE }, - { X86::VFNMADDPS4rr, X86::VFNMADDPS4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPD4rr, X86::VFNMADDPD4mr, TB_ALIGN_NONE }, - { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr, X86::VFMSUBSS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4mr_Int, TB_NO_REVERSE }, - { X86::VFMSUBSD4rr, X86::VFMSUBSD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4mr_Int, TB_NO_REVERSE }, - { X86::VFMSUBPS4rr, X86::VFMSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPD4rr, X86::VFMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBPS4Yrr, 
X86::VFMSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4mr_Int, TB_NO_REVERSE }, - { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4mr_Int, TB_NO_REVERSE }, - { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4mr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4mr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Ymr, TB_ALIGN_NONE }, - { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Ymr, TB_ALIGN_NONE }, - - // XOP foldable instructions - { X86::VPCMOVrrr, X86::VPCMOVrmr, 0 }, - { X86::VPCMOVYrrr, X86::VPCMOVYrmr, 0 }, - { X86::VPCOMBri, X86::VPCOMBmi, 0 }, - { X86::VPCOMDri, X86::VPCOMDmi, 0 }, - { X86::VPCOMQri, X86::VPCOMQmi, 0 }, - { X86::VPCOMWri, X86::VPCOMWmi, 0 }, - { X86::VPCOMUBri, X86::VPCOMUBmi, 0 }, - { X86::VPCOMUDri, X86::VPCOMUDmi, 0 }, - { X86::VPCOMUQri, X86::VPCOMUQmi, 0 }, - { X86::VPCOMUWri, X86::VPCOMUWmi, 0 }, - { X86::VPERMIL2PDrr, X86::VPERMIL2PDmr, 0 }, - { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYmr, 0 }, - { X86::VPERMIL2PSrr, X86::VPERMIL2PSmr, 0 }, - { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYmr, 0 }, - { X86::VPMACSDDrr, X86::VPMACSDDrm, 0 }, - { X86::VPMACSDQHrr, X86::VPMACSDQHrm, 0 }, - { X86::VPMACSDQLrr, X86::VPMACSDQLrm, 0 }, - { X86::VPMACSSDDrr, X86::VPMACSSDDrm, 0 }, - { X86::VPMACSSDQHrr, X86::VPMACSSDQHrm, 0 }, - { X86::VPMACSSDQLrr, X86::VPMACSSDQLrm, 0 }, - { X86::VPMACSSWDrr, X86::VPMACSSWDrm, 0 }, - { X86::VPMACSSWWrr, X86::VPMACSSWWrm, 0 }, - { X86::VPMACSWDrr, X86::VPMACSWDrm, 0 }, - { X86::VPMACSWWrr, X86::VPMACSWWrm, 0 }, - { X86::VPMADCSSWDrr, X86::VPMADCSSWDrm, 0 }, - { X86::VPMADCSWDrr, X86::VPMADCSWDrm, 0 }, - { X86::VPPERMrrr, X86::VPPERMrmr, 0 }, - { X86::VPROTBrr, X86::VPROTBrm, 0 }, - { X86::VPROTDrr, X86::VPROTDrm, 0 }, - { X86::VPROTQrr, X86::VPROTQrm, 0 }, - { X86::VPROTWrr, X86::VPROTWrm, 0 }, - { X86::VPSHABrr, X86::VPSHABrm, 0 }, - { X86::VPSHADrr, X86::VPSHADrm, 0 }, - { X86::VPSHAQrr, X86::VPSHAQrm, 0 }, - { X86::VPSHAWrr, X86::VPSHAWrm, 0 }, - { X86::VPSHLBrr, X86::VPSHLBrm, 0 }, - { X86::VPSHLDrr, X86::VPSHLDrm, 0 }, - { X86::VPSHLQrr, X86::VPSHLQrm, 0 }, - { X86::VPSHLWrr, X86::VPSHLWrm, 0 }, - - // BMI/BMI2 foldable instructions - { X86::ANDN32rr, X86::ANDN32rm, 0 }, - { X86::ANDN64rr, X86::ANDN64rm, 0 }, - { X86::MULX32rr, X86::MULX32rm, 0 }, - { X86::MULX64rr, X86::MULX64rm, 0 }, - { X86::PDEP32rr, X86::PDEP32rm, 0 }, - { X86::PDEP64rr, X86::PDEP64rm, 0 }, - { X86::PEXT32rr, X86::PEXT32rm, 0 }, - { X86::PEXT64rr, X86::PEXT64rm, 0 }, - - // ADX foldable instructions - { X86::ADCX32rr, X86::ADCX32rm, 0 }, - { X86::ADCX64rr, X86::ADCX64rm, 0 }, - { X86::ADOX32rr, X86::ADOX32rm, 0 }, - { X86::ADOX64rr, X86::ADOX64rm, 0 }, - - // AVX-512 foldable instructions - { X86::VADDPDZrr, X86::VADDPDZrm, 0 }, - { X86::VADDPSZrr, X86::VADDPSZrm, 0 }, - { X86::VADDSDZrr, X86::VADDSDZrm, 0 }, - { X86::VADDSDZrr_Int, X86::VADDSDZrm_Int, TB_NO_REVERSE }, - { 
X86::VADDSSZrr, X86::VADDSSZrm, 0 }, - { X86::VADDSSZrr_Int, X86::VADDSSZrm_Int, TB_NO_REVERSE }, - { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 }, - { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 }, - { X86::VANDNPDZrr, X86::VANDNPDZrm, 0 }, - { X86::VANDNPSZrr, X86::VANDNPSZrm, 0 }, - { X86::VANDPDZrr, X86::VANDPDZrm, 0 }, - { X86::VANDPSZrr, X86::VANDPSZrm, 0 }, - { X86::VCMPPDZrri, X86::VCMPPDZrmi, 0 }, - { X86::VCMPPSZrri, X86::VCMPPSZrmi, 0 }, - { X86::VCMPSDZrr, X86::VCMPSDZrm, 0 }, - { X86::VCMPSDZrr_Int, X86::VCMPSDZrm_Int, TB_NO_REVERSE }, - { X86::VCMPSSZrr, X86::VCMPSSZrm, 0 }, - { X86::VCMPSSZrr_Int, X86::VCMPSSZrm_Int, TB_NO_REVERSE }, - { X86::VDIVPDZrr, X86::VDIVPDZrm, 0 }, - { X86::VDIVPSZrr, X86::VDIVPSZrm, 0 }, - { X86::VDIVSDZrr, X86::VDIVSDZrm, 0 }, - { X86::VDIVSDZrr_Int, X86::VDIVSDZrm_Int, TB_NO_REVERSE }, - { X86::VDIVSSZrr, X86::VDIVSSZrm, 0 }, - { X86::VDIVSSZrr_Int, X86::VDIVSSZrm_Int, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrm, 0 }, - { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrm, 0 }, - { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrm, 0 }, - { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrm, 0 }, - { X86::VINSERTI32x4Zrr, X86::VINSERTI32x4Zrm, 0 }, - { X86::VINSERTI32x8Zrr, X86::VINSERTI32x8Zrm, 0 }, - { X86::VINSERTI64x2Zrr, X86::VINSERTI64x2Zrm, 0 }, - { X86::VINSERTI64x4Zrr, X86::VINSERTI64x4Zrm, 0 }, - { X86::VMAXCPDZrr, X86::VMAXCPDZrm, 0 }, - { X86::VMAXCPSZrr, X86::VMAXCPSZrm, 0 }, - { X86::VMAXCSDZrr, X86::VMAXCSDZrm, 0 }, - { X86::VMAXCSSZrr, X86::VMAXCSSZrm, 0 }, - { X86::VMAXPDZrr, X86::VMAXPDZrm, 0 }, - { X86::VMAXPSZrr, X86::VMAXPSZrm, 0 }, - { X86::VMAXSDZrr, X86::VMAXSDZrm, 0 }, - { X86::VMAXSDZrr_Int, X86::VMAXSDZrm_Int, TB_NO_REVERSE }, - { X86::VMAXSSZrr, X86::VMAXSSZrm, 0 }, - { X86::VMAXSSZrr_Int, X86::VMAXSSZrm_Int, TB_NO_REVERSE }, - { X86::VMINCPDZrr, X86::VMINCPDZrm, 0 }, - { X86::VMINCPSZrr, X86::VMINCPSZrm, 0 }, - { X86::VMINCSDZrr, X86::VMINCSDZrm, 0 }, - { X86::VMINCSSZrr, X86::VMINCSSZrm, 0 }, - { X86::VMINPDZrr, X86::VMINPDZrm, 0 }, - { X86::VMINPSZrr, X86::VMINPSZrm, 0 }, - { X86::VMINSDZrr, X86::VMINSDZrm, 0 }, - { X86::VMINSDZrr_Int, X86::VMINSDZrm_Int, TB_NO_REVERSE }, - { X86::VMINSSZrr, X86::VMINSSZrm, 0 }, - { X86::VMINSSZrr_Int, X86::VMINSSZrm_Int, TB_NO_REVERSE }, - { X86::VMOVLHPSZrr, X86::VMOVHPSZ128rm, TB_NO_REVERSE }, - { X86::VMULPDZrr, X86::VMULPDZrm, 0 }, - { X86::VMULPSZrr, X86::VMULPSZrm, 0 }, - { X86::VMULSDZrr, X86::VMULSDZrm, 0 }, - { X86::VMULSDZrr_Int, X86::VMULSDZrm_Int, TB_NO_REVERSE }, - { X86::VMULSSZrr, X86::VMULSSZrm, 0 }, - { X86::VMULSSZrr_Int, X86::VMULSSZrm_Int, TB_NO_REVERSE }, - { X86::VORPDZrr, X86::VORPDZrm, 0 }, - { X86::VORPSZrr, X86::VORPSZrm, 0 }, - { X86::VPACKSSDWZrr, X86::VPACKSSDWZrm, 0 }, - { X86::VPACKSSWBZrr, X86::VPACKSSWBZrm, 0 }, - { X86::VPACKUSDWZrr, X86::VPACKUSDWZrm, 0 }, - { X86::VPACKUSWBZrr, X86::VPACKUSWBZrm, 0 }, - { X86::VPADDBZrr, X86::VPADDBZrm, 0 }, - { X86::VPADDDZrr, X86::VPADDDZrm, 0 }, - { X86::VPADDQZrr, X86::VPADDQZrm, 0 }, - { X86::VPADDSBZrr, X86::VPADDSBZrm, 0 }, - { X86::VPADDSWZrr, X86::VPADDSWZrm, 0 }, - { X86::VPADDUSBZrr, X86::VPADDUSBZrm, 0 }, - { X86::VPADDUSWZrr, X86::VPADDUSWZrm, 0 }, - { X86::VPADDWZrr, X86::VPADDWZrm, 0 }, - { X86::VPALIGNRZrri, X86::VPALIGNRZrmi, 0 }, - { X86::VPANDDZrr, X86::VPANDDZrm, 0 }, - { X86::VPANDNDZrr, X86::VPANDNDZrm, 0 }, - { X86::VPANDNQZrr, X86::VPANDNQZrm, 0 }, - { X86::VPANDQZrr, X86::VPANDQZrm, 0 }, - { X86::VPAVGBZrr, X86::VPAVGBZrm, 0 }, - { X86::VPAVGWZrr, X86::VPAVGWZrm, 0 }, - { X86::VPCMPBZrri, 
X86::VPCMPBZrmi, 0 }, - { X86::VPCMPDZrri, X86::VPCMPDZrmi, 0 }, - { X86::VPCMPEQBZrr, X86::VPCMPEQBZrm, 0 }, - { X86::VPCMPEQDZrr, X86::VPCMPEQDZrm, 0 }, - { X86::VPCMPEQQZrr, X86::VPCMPEQQZrm, 0 }, - { X86::VPCMPEQWZrr, X86::VPCMPEQWZrm, 0 }, - { X86::VPCMPGTBZrr, X86::VPCMPGTBZrm, 0 }, - { X86::VPCMPGTDZrr, X86::VPCMPGTDZrm, 0 }, - { X86::VPCMPGTQZrr, X86::VPCMPGTQZrm, 0 }, - { X86::VPCMPGTWZrr, X86::VPCMPGTWZrm, 0 }, - { X86::VPCMPQZrri, X86::VPCMPQZrmi, 0 }, - { X86::VPCMPUBZrri, X86::VPCMPUBZrmi, 0 }, - { X86::VPCMPUDZrri, X86::VPCMPUDZrmi, 0 }, - { X86::VPCMPUQZrri, X86::VPCMPUQZrmi, 0 }, - { X86::VPCMPUWZrri, X86::VPCMPUWZrmi, 0 }, - { X86::VPCMPWZrri, X86::VPCMPWZrmi, 0 }, - { X86::VPERMBZrr, X86::VPERMBZrm, 0 }, - { X86::VPERMDZrr, X86::VPERMDZrm, 0 }, - { X86::VPERMILPDZrr, X86::VPERMILPDZrm, 0 }, - { X86::VPERMILPSZrr, X86::VPERMILPSZrm, 0 }, - { X86::VPERMPDZrr, X86::VPERMPDZrm, 0 }, - { X86::VPERMPSZrr, X86::VPERMPSZrm, 0 }, - { X86::VPERMQZrr, X86::VPERMQZrm, 0 }, - { X86::VPERMWZrr, X86::VPERMWZrm, 0 }, - { X86::VPINSRBZrr, X86::VPINSRBZrm, 0 }, - { X86::VPINSRDZrr, X86::VPINSRDZrm, 0 }, - { X86::VPINSRQZrr, X86::VPINSRQZrm, 0 }, - { X86::VPINSRWZrr, X86::VPINSRWZrm, 0 }, - { X86::VPMADDUBSWZrr, X86::VPMADDUBSWZrm, 0 }, - { X86::VPMADDWDZrr, X86::VPMADDWDZrm, 0 }, - { X86::VPMAXSBZrr, X86::VPMAXSBZrm, 0 }, - { X86::VPMAXSDZrr, X86::VPMAXSDZrm, 0 }, - { X86::VPMAXSQZrr, X86::VPMAXSQZrm, 0 }, - { X86::VPMAXSWZrr, X86::VPMAXSWZrm, 0 }, - { X86::VPMAXUBZrr, X86::VPMAXUBZrm, 0 }, - { X86::VPMAXUDZrr, X86::VPMAXUDZrm, 0 }, - { X86::VPMAXUQZrr, X86::VPMAXUQZrm, 0 }, - { X86::VPMAXUWZrr, X86::VPMAXUWZrm, 0 }, - { X86::VPMINSBZrr, X86::VPMINSBZrm, 0 }, - { X86::VPMINSDZrr, X86::VPMINSDZrm, 0 }, - { X86::VPMINSQZrr, X86::VPMINSQZrm, 0 }, - { X86::VPMINSWZrr, X86::VPMINSWZrm, 0 }, - { X86::VPMINUBZrr, X86::VPMINUBZrm, 0 }, - { X86::VPMINUDZrr, X86::VPMINUDZrm, 0 }, - { X86::VPMINUQZrr, X86::VPMINUQZrm, 0 }, - { X86::VPMINUWZrr, X86::VPMINUWZrm, 0 }, - { X86::VPMULDQZrr, X86::VPMULDQZrm, 0 }, - { X86::VPMULLDZrr, X86::VPMULLDZrm, 0 }, - { X86::VPMULLQZrr, X86::VPMULLQZrm, 0 }, - { X86::VPMULLWZrr, X86::VPMULLWZrm, 0 }, - { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, - { X86::VPORDZrr, X86::VPORDZrm, 0 }, - { X86::VPORQZrr, X86::VPORQZrm, 0 }, - { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 }, - { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, - { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 }, - { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 }, - { X86::VPSLLVDZrr, X86::VPSLLVDZrm, 0 }, - { X86::VPSLLVQZrr, X86::VPSLLVQZrm, 0 }, - { X86::VPSLLVWZrr, X86::VPSLLVWZrm, 0 }, - { X86::VPSLLWZrr, X86::VPSLLWZrm, 0 }, - { X86::VPSRADZrr, X86::VPSRADZrm, 0 }, - { X86::VPSRAQZrr, X86::VPSRAQZrm, 0 }, - { X86::VPSRAVDZrr, X86::VPSRAVDZrm, 0 }, - { X86::VPSRAVQZrr, X86::VPSRAVQZrm, 0 }, - { X86::VPSRAVWZrr, X86::VPSRAVWZrm, 0 }, - { X86::VPSRAWZrr, X86::VPSRAWZrm, 0 }, - { X86::VPSRLDZrr, X86::VPSRLDZrm, 0 }, - { X86::VPSRLQZrr, X86::VPSRLQZrm, 0 }, - { X86::VPSRLVDZrr, X86::VPSRLVDZrm, 0 }, - { X86::VPSRLVQZrr, X86::VPSRLVQZrm, 0 }, - { X86::VPSRLVWZrr, X86::VPSRLVWZrm, 0 }, - { X86::VPSRLWZrr, X86::VPSRLWZrm, 0 }, - { X86::VPSUBBZrr, X86::VPSUBBZrm, 0 }, - { X86::VPSUBDZrr, X86::VPSUBDZrm, 0 }, - { X86::VPSUBQZrr, X86::VPSUBQZrm, 0 }, - { X86::VPSUBSBZrr, X86::VPSUBSBZrm, 0 }, - { X86::VPSUBSWZrr, X86::VPSUBSWZrm, 0 }, - { X86::VPSUBUSBZrr, X86::VPSUBUSBZrm, 0 }, - { X86::VPSUBUSWZrr, X86::VPSUBUSWZrm, 0 }, - { X86::VPSUBWZrr, X86::VPSUBWZrm, 0 }, - { X86::VPUNPCKHBWZrr, X86::VPUNPCKHBWZrm, 0 }, - { X86::VPUNPCKHDQZrr, 
X86::VPUNPCKHDQZrm, 0 }, - { X86::VPUNPCKHQDQZrr, X86::VPUNPCKHQDQZrm, 0 }, - { X86::VPUNPCKHWDZrr, X86::VPUNPCKHWDZrm, 0 }, - { X86::VPUNPCKLBWZrr, X86::VPUNPCKLBWZrm, 0 }, - { X86::VPUNPCKLDQZrr, X86::VPUNPCKLDQZrm, 0 }, - { X86::VPUNPCKLQDQZrr, X86::VPUNPCKLQDQZrm, 0 }, - { X86::VPUNPCKLWDZrr, X86::VPUNPCKLWDZrm, 0 }, - { X86::VPXORDZrr, X86::VPXORDZrm, 0 }, - { X86::VPXORQZrr, X86::VPXORQZrm, 0 }, - { X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 }, - { X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 }, - { X86::VSUBPDZrr, X86::VSUBPDZrm, 0 }, - { X86::VSUBPSZrr, X86::VSUBPSZrm, 0 }, - { X86::VSUBSDZrr, X86::VSUBSDZrm, 0 }, - { X86::VSUBSDZrr_Int, X86::VSUBSDZrm_Int, TB_NO_REVERSE }, - { X86::VSUBSSZrr, X86::VSUBSSZrm, 0 }, - { X86::VSUBSSZrr_Int, X86::VSUBSSZrm_Int, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrm, 0 }, - { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrm, 0 }, - { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrm, 0 }, - { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrm, 0 }, - { X86::VXORPDZrr, X86::VXORPDZrm, 0 }, - { X86::VXORPSZrr, X86::VXORPSZrm, 0 }, - - // AVX-512{F,VL} foldable instructions - { X86::VADDPDZ128rr, X86::VADDPDZ128rm, 0 }, - { X86::VADDPDZ256rr, X86::VADDPDZ256rm, 0 }, - { X86::VADDPSZ128rr, X86::VADDPSZ128rm, 0 }, - { X86::VADDPSZ256rr, X86::VADDPSZ256rm, 0 }, - { X86::VALIGNDZ128rri, X86::VALIGNDZ128rmi, 0 }, - { X86::VALIGNDZ256rri, X86::VALIGNDZ256rmi, 0 }, - { X86::VALIGNQZ128rri, X86::VALIGNQZ128rmi, 0 }, - { X86::VALIGNQZ256rri, X86::VALIGNQZ256rmi, 0 }, - { X86::VANDNPDZ128rr, X86::VANDNPDZ128rm, 0 }, - { X86::VANDNPDZ256rr, X86::VANDNPDZ256rm, 0 }, - { X86::VANDNPSZ128rr, X86::VANDNPSZ128rm, 0 }, - { X86::VANDNPSZ256rr, X86::VANDNPSZ256rm, 0 }, - { X86::VANDPDZ128rr, X86::VANDPDZ128rm, 0 }, - { X86::VANDPDZ256rr, X86::VANDPDZ256rm, 0 }, - { X86::VANDPSZ128rr, X86::VANDPSZ128rm, 0 }, - { X86::VANDPSZ256rr, X86::VANDPSZ256rm, 0 }, - { X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 }, - { X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 }, - { X86::VCMPPSZ128rri, X86::VCMPPSZ128rmi, 0 }, - { X86::VCMPPSZ256rri, X86::VCMPPSZ256rmi, 0 }, - { X86::VDIVPDZ128rr, X86::VDIVPDZ128rm, 0 }, - { X86::VDIVPDZ256rr, X86::VDIVPDZ256rm, 0 }, - { X86::VDIVPSZ128rr, X86::VDIVPSZ128rm, 0 }, - { X86::VDIVPSZ256rr, X86::VDIVPSZ256rm, 0 }, - { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rm, 0 }, - { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rm, 0 }, - { X86::VINSERTI32x4Z256rr,X86::VINSERTI32x4Z256rm, 0 }, - { X86::VINSERTI64x2Z256rr,X86::VINSERTI64x2Z256rm, 0 }, - { X86::VMAXCPDZ128rr, X86::VMAXCPDZ128rm, 0 }, - { X86::VMAXCPDZ256rr, X86::VMAXCPDZ256rm, 0 }, - { X86::VMAXCPSZ128rr, X86::VMAXCPSZ128rm, 0 }, - { X86::VMAXCPSZ256rr, X86::VMAXCPSZ256rm, 0 }, - { X86::VMAXPDZ128rr, X86::VMAXPDZ128rm, 0 }, - { X86::VMAXPDZ256rr, X86::VMAXPDZ256rm, 0 }, - { X86::VMAXPSZ128rr, X86::VMAXPSZ128rm, 0 }, - { X86::VMAXPSZ256rr, X86::VMAXPSZ256rm, 0 }, - { X86::VMINCPDZ128rr, X86::VMINCPDZ128rm, 0 }, - { X86::VMINCPDZ256rr, X86::VMINCPDZ256rm, 0 }, - { X86::VMINCPSZ128rr, X86::VMINCPSZ128rm, 0 }, - { X86::VMINCPSZ256rr, X86::VMINCPSZ256rm, 0 }, - { X86::VMINPDZ128rr, X86::VMINPDZ128rm, 0 }, - { X86::VMINPDZ256rr, X86::VMINPDZ256rm, 0 }, - { X86::VMINPSZ128rr, X86::VMINPSZ128rm, 0 }, - { X86::VMINPSZ256rr, X86::VMINPSZ256rm, 0 }, - { X86::VMULPDZ128rr, X86::VMULPDZ128rm, 0 }, - { X86::VMULPDZ256rr, X86::VMULPDZ256rm, 0 }, - { X86::VMULPSZ128rr, X86::VMULPSZ128rm, 0 }, - { X86::VMULPSZ256rr, X86::VMULPSZ256rm, 0 }, - { X86::VORPDZ128rr, X86::VORPDZ128rm, 0 }, - { X86::VORPDZ256rr, X86::VORPDZ256rm, 0 }, - { X86::VORPSZ128rr, 
X86::VORPSZ128rm, 0 }, - { X86::VORPSZ256rr, X86::VORPSZ256rm, 0 }, - { X86::VPACKSSDWZ256rr, X86::VPACKSSDWZ256rm, 0 }, - { X86::VPACKSSDWZ128rr, X86::VPACKSSDWZ128rm, 0 }, - { X86::VPACKSSWBZ256rr, X86::VPACKSSWBZ256rm, 0 }, - { X86::VPACKSSWBZ128rr, X86::VPACKSSWBZ128rm, 0 }, - { X86::VPACKUSDWZ256rr, X86::VPACKUSDWZ256rm, 0 }, - { X86::VPACKUSDWZ128rr, X86::VPACKUSDWZ128rm, 0 }, - { X86::VPACKUSWBZ256rr, X86::VPACKUSWBZ256rm, 0 }, - { X86::VPACKUSWBZ128rr, X86::VPACKUSWBZ128rm, 0 }, - { X86::VPADDBZ128rr, X86::VPADDBZ128rm, 0 }, - { X86::VPADDBZ256rr, X86::VPADDBZ256rm, 0 }, - { X86::VPADDDZ128rr, X86::VPADDDZ128rm, 0 }, - { X86::VPADDDZ256rr, X86::VPADDDZ256rm, 0 }, - { X86::VPADDQZ128rr, X86::VPADDQZ128rm, 0 }, - { X86::VPADDQZ256rr, X86::VPADDQZ256rm, 0 }, - { X86::VPADDSBZ128rr, X86::VPADDSBZ128rm, 0 }, - { X86::VPADDSBZ256rr, X86::VPADDSBZ256rm, 0 }, - { X86::VPADDSWZ128rr, X86::VPADDSWZ128rm, 0 }, - { X86::VPADDSWZ256rr, X86::VPADDSWZ256rm, 0 }, - { X86::VPADDUSBZ128rr, X86::VPADDUSBZ128rm, 0 }, - { X86::VPADDUSBZ256rr, X86::VPADDUSBZ256rm, 0 }, - { X86::VPADDUSWZ128rr, X86::VPADDUSWZ128rm, 0 }, - { X86::VPADDUSWZ256rr, X86::VPADDUSWZ256rm, 0 }, - { X86::VPADDWZ128rr, X86::VPADDWZ128rm, 0 }, - { X86::VPADDWZ256rr, X86::VPADDWZ256rm, 0 }, - { X86::VPALIGNRZ128rri, X86::VPALIGNRZ128rmi, 0 }, - { X86::VPALIGNRZ256rri, X86::VPALIGNRZ256rmi, 0 }, - { X86::VPANDDZ128rr, X86::VPANDDZ128rm, 0 }, - { X86::VPANDDZ256rr, X86::VPANDDZ256rm, 0 }, - { X86::VPANDNDZ128rr, X86::VPANDNDZ128rm, 0 }, - { X86::VPANDNDZ256rr, X86::VPANDNDZ256rm, 0 }, - { X86::VPANDNQZ128rr, X86::VPANDNQZ128rm, 0 }, - { X86::VPANDNQZ256rr, X86::VPANDNQZ256rm, 0 }, - { X86::VPANDQZ128rr, X86::VPANDQZ128rm, 0 }, - { X86::VPANDQZ256rr, X86::VPANDQZ256rm, 0 }, - { X86::VPAVGBZ128rr, X86::VPAVGBZ128rm, 0 }, - { X86::VPAVGBZ256rr, X86::VPAVGBZ256rm, 0 }, - { X86::VPAVGWZ128rr, X86::VPAVGWZ128rm, 0 }, - { X86::VPAVGWZ256rr, X86::VPAVGWZ256rm, 0 }, - { X86::VPCMPBZ128rri, X86::VPCMPBZ128rmi, 0 }, - { X86::VPCMPBZ256rri, X86::VPCMPBZ256rmi, 0 }, - { X86::VPCMPDZ128rri, X86::VPCMPDZ128rmi, 0 }, - { X86::VPCMPDZ256rri, X86::VPCMPDZ256rmi, 0 }, - { X86::VPCMPEQBZ128rr, X86::VPCMPEQBZ128rm, 0 }, - { X86::VPCMPEQBZ256rr, X86::VPCMPEQBZ256rm, 0 }, - { X86::VPCMPEQDZ128rr, X86::VPCMPEQDZ128rm, 0 }, - { X86::VPCMPEQDZ256rr, X86::VPCMPEQDZ256rm, 0 }, - { X86::VPCMPEQQZ128rr, X86::VPCMPEQQZ128rm, 0 }, - { X86::VPCMPEQQZ256rr, X86::VPCMPEQQZ256rm, 0 }, - { X86::VPCMPEQWZ128rr, X86::VPCMPEQWZ128rm, 0 }, - { X86::VPCMPEQWZ256rr, X86::VPCMPEQWZ256rm, 0 }, - { X86::VPCMPGTBZ128rr, X86::VPCMPGTBZ128rm, 0 }, - { X86::VPCMPGTBZ256rr, X86::VPCMPGTBZ256rm, 0 }, - { X86::VPCMPGTDZ128rr, X86::VPCMPGTDZ128rm, 0 }, - { X86::VPCMPGTDZ256rr, X86::VPCMPGTDZ256rm, 0 }, - { X86::VPCMPGTQZ128rr, X86::VPCMPGTQZ128rm, 0 }, - { X86::VPCMPGTQZ256rr, X86::VPCMPGTQZ256rm, 0 }, - { X86::VPCMPGTWZ128rr, X86::VPCMPGTWZ128rm, 0 }, - { X86::VPCMPGTWZ256rr, X86::VPCMPGTWZ256rm, 0 }, - { X86::VPCMPQZ128rri, X86::VPCMPQZ128rmi, 0 }, - { X86::VPCMPQZ256rri, X86::VPCMPQZ256rmi, 0 }, - { X86::VPCMPUBZ128rri, X86::VPCMPUBZ128rmi, 0 }, - { X86::VPCMPUBZ256rri, X86::VPCMPUBZ256rmi, 0 }, - { X86::VPCMPUDZ128rri, X86::VPCMPUDZ128rmi, 0 }, - { X86::VPCMPUDZ256rri, X86::VPCMPUDZ256rmi, 0 }, - { X86::VPCMPUQZ128rri, X86::VPCMPUQZ128rmi, 0 }, - { X86::VPCMPUQZ256rri, X86::VPCMPUQZ256rmi, 0 }, - { X86::VPCMPUWZ128rri, X86::VPCMPUWZ128rmi, 0 }, - { X86::VPCMPUWZ256rri, X86::VPCMPUWZ256rmi, 0 }, - { X86::VPCMPWZ128rri, X86::VPCMPWZ128rmi, 0 }, - { X86::VPCMPWZ256rri, 
X86::VPCMPWZ256rmi, 0 }, - { X86::VPERMBZ128rr, X86::VPERMBZ128rm, 0 }, - { X86::VPERMBZ256rr, X86::VPERMBZ256rm, 0 }, - { X86::VPERMDZ256rr, X86::VPERMDZ256rm, 0 }, - { X86::VPERMILPDZ128rr, X86::VPERMILPDZ128rm, 0 }, - { X86::VPERMILPDZ256rr, X86::VPERMILPDZ256rm, 0 }, - { X86::VPERMILPSZ128rr, X86::VPERMILPSZ128rm, 0 }, - { X86::VPERMILPSZ256rr, X86::VPERMILPSZ256rm, 0 }, - { X86::VPERMPDZ256rr, X86::VPERMPDZ256rm, 0 }, - { X86::VPERMPSZ256rr, X86::VPERMPSZ256rm, 0 }, - { X86::VPERMQZ256rr, X86::VPERMQZ256rm, 0 }, - { X86::VPERMWZ128rr, X86::VPERMWZ128rm, 0 }, - { X86::VPERMWZ256rr, X86::VPERMWZ256rm, 0 }, - { X86::VPMADDUBSWZ128rr, X86::VPMADDUBSWZ128rm, 0 }, - { X86::VPMADDUBSWZ256rr, X86::VPMADDUBSWZ256rm, 0 }, - { X86::VPMADDWDZ128rr, X86::VPMADDWDZ128rm, 0 }, - { X86::VPMADDWDZ256rr, X86::VPMADDWDZ256rm, 0 }, - { X86::VPMAXSBZ128rr, X86::VPMAXSBZ128rm, 0 }, - { X86::VPMAXSBZ256rr, X86::VPMAXSBZ256rm, 0 }, - { X86::VPMAXSDZ128rr, X86::VPMAXSDZ128rm, 0 }, - { X86::VPMAXSDZ256rr, X86::VPMAXSDZ256rm, 0 }, - { X86::VPMAXSQZ128rr, X86::VPMAXSQZ128rm, 0 }, - { X86::VPMAXSQZ256rr, X86::VPMAXSQZ256rm, 0 }, - { X86::VPMAXSWZ128rr, X86::VPMAXSWZ128rm, 0 }, - { X86::VPMAXSWZ256rr, X86::VPMAXSWZ256rm, 0 }, - { X86::VPMAXUBZ128rr, X86::VPMAXUBZ128rm, 0 }, - { X86::VPMAXUBZ256rr, X86::VPMAXUBZ256rm, 0 }, - { X86::VPMAXUDZ128rr, X86::VPMAXUDZ128rm, 0 }, - { X86::VPMAXUDZ256rr, X86::VPMAXUDZ256rm, 0 }, - { X86::VPMAXUQZ128rr, X86::VPMAXUQZ128rm, 0 }, - { X86::VPMAXUQZ256rr, X86::VPMAXUQZ256rm, 0 }, - { X86::VPMAXUWZ128rr, X86::VPMAXUWZ128rm, 0 }, - { X86::VPMAXUWZ256rr, X86::VPMAXUWZ256rm, 0 }, - { X86::VPMINSBZ128rr, X86::VPMINSBZ128rm, 0 }, - { X86::VPMINSBZ256rr, X86::VPMINSBZ256rm, 0 }, - { X86::VPMINSDZ128rr, X86::VPMINSDZ128rm, 0 }, - { X86::VPMINSDZ256rr, X86::VPMINSDZ256rm, 0 }, - { X86::VPMINSQZ128rr, X86::VPMINSQZ128rm, 0 }, - { X86::VPMINSQZ256rr, X86::VPMINSQZ256rm, 0 }, - { X86::VPMINSWZ128rr, X86::VPMINSWZ128rm, 0 }, - { X86::VPMINSWZ256rr, X86::VPMINSWZ256rm, 0 }, - { X86::VPMINUBZ128rr, X86::VPMINUBZ128rm, 0 }, - { X86::VPMINUBZ256rr, X86::VPMINUBZ256rm, 0 }, - { X86::VPMINUDZ128rr, X86::VPMINUDZ128rm, 0 }, - { X86::VPMINUDZ256rr, X86::VPMINUDZ256rm, 0 }, - { X86::VPMINUQZ128rr, X86::VPMINUQZ128rm, 0 }, - { X86::VPMINUQZ256rr, X86::VPMINUQZ256rm, 0 }, - { X86::VPMINUWZ128rr, X86::VPMINUWZ128rm, 0 }, - { X86::VPMINUWZ256rr, X86::VPMINUWZ256rm, 0 }, - { X86::VPMULDQZ128rr, X86::VPMULDQZ128rm, 0 }, - { X86::VPMULDQZ256rr, X86::VPMULDQZ256rm, 0 }, - { X86::VPMULLDZ128rr, X86::VPMULLDZ128rm, 0 }, - { X86::VPMULLDZ256rr, X86::VPMULLDZ256rm, 0 }, - { X86::VPMULLQZ128rr, X86::VPMULLQZ128rm, 0 }, - { X86::VPMULLQZ256rr, X86::VPMULLQZ256rm, 0 }, - { X86::VPMULLWZ128rr, X86::VPMULLWZ128rm, 0 }, - { X86::VPMULLWZ256rr, X86::VPMULLWZ256rm, 0 }, - { X86::VPMULUDQZ128rr, X86::VPMULUDQZ128rm, 0 }, - { X86::VPMULUDQZ256rr, X86::VPMULUDQZ256rm, 0 }, - { X86::VPORDZ128rr, X86::VPORDZ128rm, 0 }, - { X86::VPORDZ256rr, X86::VPORDZ256rm, 0 }, - { X86::VPORQZ128rr, X86::VPORQZ128rm, 0 }, - { X86::VPORQZ256rr, X86::VPORQZ256rm, 0 }, - { X86::VPSADBWZ128rr, X86::VPSADBWZ128rm, 0 }, - { X86::VPSADBWZ256rr, X86::VPSADBWZ256rm, 0 }, - { X86::VPSHUFBZ128rr, X86::VPSHUFBZ128rm, 0 }, - { X86::VPSHUFBZ256rr, X86::VPSHUFBZ256rm, 0 }, - { X86::VPSLLDZ128rr, X86::VPSLLDZ128rm, 0 }, - { X86::VPSLLDZ256rr, X86::VPSLLDZ256rm, 0 }, - { X86::VPSLLQZ128rr, X86::VPSLLQZ128rm, 0 }, - { X86::VPSLLQZ256rr, X86::VPSLLQZ256rm, 0 }, - { X86::VPSLLVDZ128rr, X86::VPSLLVDZ128rm, 0 }, - { X86::VPSLLVDZ256rr, X86::VPSLLVDZ256rm, 0 }, - 
{ X86::VPSLLVQZ128rr, X86::VPSLLVQZ128rm, 0 }, - { X86::VPSLLVQZ256rr, X86::VPSLLVQZ256rm, 0 }, - { X86::VPSLLVWZ128rr, X86::VPSLLVWZ128rm, 0 }, - { X86::VPSLLVWZ256rr, X86::VPSLLVWZ256rm, 0 }, - { X86::VPSLLWZ128rr, X86::VPSLLWZ128rm, 0 }, - { X86::VPSLLWZ256rr, X86::VPSLLWZ256rm, 0 }, - { X86::VPSRADZ128rr, X86::VPSRADZ128rm, 0 }, - { X86::VPSRADZ256rr, X86::VPSRADZ256rm, 0 }, - { X86::VPSRAQZ128rr, X86::VPSRAQZ128rm, 0 }, - { X86::VPSRAQZ256rr, X86::VPSRAQZ256rm, 0 }, - { X86::VPSRAVDZ128rr, X86::VPSRAVDZ128rm, 0 }, - { X86::VPSRAVDZ256rr, X86::VPSRAVDZ256rm, 0 }, - { X86::VPSRAVQZ128rr, X86::VPSRAVQZ128rm, 0 }, - { X86::VPSRAVQZ256rr, X86::VPSRAVQZ256rm, 0 }, - { X86::VPSRAVWZ128rr, X86::VPSRAVWZ128rm, 0 }, - { X86::VPSRAVWZ256rr, X86::VPSRAVWZ256rm, 0 }, - { X86::VPSRAWZ128rr, X86::VPSRAWZ128rm, 0 }, - { X86::VPSRAWZ256rr, X86::VPSRAWZ256rm, 0 }, - { X86::VPSRLDZ128rr, X86::VPSRLDZ128rm, 0 }, - { X86::VPSRLDZ256rr, X86::VPSRLDZ256rm, 0 }, - { X86::VPSRLQZ128rr, X86::VPSRLQZ128rm, 0 }, - { X86::VPSRLQZ256rr, X86::VPSRLQZ256rm, 0 }, - { X86::VPSRLVDZ128rr, X86::VPSRLVDZ128rm, 0 }, - { X86::VPSRLVDZ256rr, X86::VPSRLVDZ256rm, 0 }, - { X86::VPSRLVQZ128rr, X86::VPSRLVQZ128rm, 0 }, - { X86::VPSRLVQZ256rr, X86::VPSRLVQZ256rm, 0 }, - { X86::VPSRLVWZ128rr, X86::VPSRLVWZ128rm, 0 }, - { X86::VPSRLVWZ256rr, X86::VPSRLVWZ256rm, 0 }, - { X86::VPSRLWZ128rr, X86::VPSRLWZ128rm, 0 }, - { X86::VPSRLWZ256rr, X86::VPSRLWZ256rm, 0 }, - { X86::VPSUBBZ128rr, X86::VPSUBBZ128rm, 0 }, - { X86::VPSUBBZ256rr, X86::VPSUBBZ256rm, 0 }, - { X86::VPSUBDZ128rr, X86::VPSUBDZ128rm, 0 }, - { X86::VPSUBDZ256rr, X86::VPSUBDZ256rm, 0 }, - { X86::VPSUBQZ128rr, X86::VPSUBQZ128rm, 0 }, - { X86::VPSUBQZ256rr, X86::VPSUBQZ256rm, 0 }, - { X86::VPSUBSBZ128rr, X86::VPSUBSBZ128rm, 0 }, - { X86::VPSUBSBZ256rr, X86::VPSUBSBZ256rm, 0 }, - { X86::VPSUBSWZ128rr, X86::VPSUBSWZ128rm, 0 }, - { X86::VPSUBSWZ256rr, X86::VPSUBSWZ256rm, 0 }, - { X86::VPSUBUSBZ128rr, X86::VPSUBUSBZ128rm, 0 }, - { X86::VPSUBUSBZ256rr, X86::VPSUBUSBZ256rm, 0 }, - { X86::VPSUBUSWZ128rr, X86::VPSUBUSWZ128rm, 0 }, - { X86::VPSUBUSWZ256rr, X86::VPSUBUSWZ256rm, 0 }, - { X86::VPSUBWZ128rr, X86::VPSUBWZ128rm, 0 }, - { X86::VPSUBWZ256rr, X86::VPSUBWZ256rm, 0 }, - { X86::VPUNPCKHBWZ128rr, X86::VPUNPCKHBWZ128rm, 0 }, - { X86::VPUNPCKHBWZ256rr, X86::VPUNPCKHBWZ256rm, 0 }, - { X86::VPUNPCKHDQZ128rr, X86::VPUNPCKHDQZ128rm, 0 }, - { X86::VPUNPCKHDQZ256rr, X86::VPUNPCKHDQZ256rm, 0 }, - { X86::VPUNPCKHQDQZ128rr, X86::VPUNPCKHQDQZ128rm, 0 }, - { X86::VPUNPCKHQDQZ256rr, X86::VPUNPCKHQDQZ256rm, 0 }, - { X86::VPUNPCKHWDZ128rr, X86::VPUNPCKHWDZ128rm, 0 }, - { X86::VPUNPCKHWDZ256rr, X86::VPUNPCKHWDZ256rm, 0 }, - { X86::VPUNPCKLBWZ128rr, X86::VPUNPCKLBWZ128rm, 0 }, - { X86::VPUNPCKLBWZ256rr, X86::VPUNPCKLBWZ256rm, 0 }, - { X86::VPUNPCKLDQZ128rr, X86::VPUNPCKLDQZ128rm, 0 }, - { X86::VPUNPCKLDQZ256rr, X86::VPUNPCKLDQZ256rm, 0 }, - { X86::VPUNPCKLQDQZ128rr, X86::VPUNPCKLQDQZ128rm, 0 }, - { X86::VPUNPCKLQDQZ256rr, X86::VPUNPCKLQDQZ256rm, 0 }, - { X86::VPUNPCKLWDZ128rr, X86::VPUNPCKLWDZ128rm, 0 }, - { X86::VPUNPCKLWDZ256rr, X86::VPUNPCKLWDZ256rm, 0 }, - { X86::VPXORDZ128rr, X86::VPXORDZ128rm, 0 }, - { X86::VPXORDZ256rr, X86::VPXORDZ256rm, 0 }, - { X86::VPXORQZ128rr, X86::VPXORQZ128rm, 0 }, - { X86::VPXORQZ256rr, X86::VPXORQZ256rm, 0 }, - { X86::VSHUFPDZ128rri, X86::VSHUFPDZ128rmi, 0 }, - { X86::VSHUFPDZ256rri, X86::VSHUFPDZ256rmi, 0 }, - { X86::VSHUFPSZ128rri, X86::VSHUFPSZ128rmi, 0 }, - { X86::VSHUFPSZ256rri, X86::VSHUFPSZ256rmi, 0 }, - { X86::VSUBPDZ128rr, X86::VSUBPDZ128rm, 0 }, - { 
X86::VSUBPDZ256rr, X86::VSUBPDZ256rm, 0 }, - { X86::VSUBPSZ128rr, X86::VSUBPSZ128rm, 0 }, - { X86::VSUBPSZ256rr, X86::VSUBPSZ256rm, 0 }, - { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rm, 0 }, - { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rm, 0 }, - { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rm, 0 }, - { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rm, 0 }, - { X86::VUNPCKLPDZ128rr, X86::VUNPCKLPDZ128rm, 0 }, - { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rm, 0 }, - { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rm, 0 }, - { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rm, 0 }, - { X86::VXORPDZ128rr, X86::VXORPDZ128rm, 0 }, - { X86::VXORPDZ256rr, X86::VXORPDZ256rm, 0 }, - { X86::VXORPSZ128rr, X86::VXORPSZ128rm, 0 }, - { X86::VXORPSZ256rr, X86::VXORPSZ256rm, 0 }, - - // AVX-512 masked foldable instructions - { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE }, - { X86::VPABSBZrrkz, X86::VPABSBZrmkz, 0 }, - { X86::VPABSDZrrkz, X86::VPABSDZrmkz, 0 }, - { X86::VPABSQZrrkz, X86::VPABSQZrmkz, 0 }, - { X86::VPABSWZrrkz, X86::VPABSWZrmkz, 0 }, - { X86::VPERMILPDZrikz, X86::VPERMILPDZmikz, 0 }, - { X86::VPERMILPSZrikz, X86::VPERMILPSZmikz, 0 }, - { X86::VPERMPDZrikz, X86::VPERMPDZmikz, 0 }, - { X86::VPERMQZrikz, X86::VPERMQZmikz, 0 }, - { X86::VPMOVSXBDZrrkz, X86::VPMOVSXBDZrmkz, 0 }, - { X86::VPMOVSXBQZrrkz, X86::VPMOVSXBQZrmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrrkz, X86::VPMOVSXBWZrmkz, 0 }, - { X86::VPMOVSXDQZrrkz, X86::VPMOVSXDQZrmkz, 0 }, - { X86::VPMOVSXWDZrrkz, X86::VPMOVSXWDZrmkz, 0 }, - { X86::VPMOVSXWQZrrkz, X86::VPMOVSXWQZrmkz, 0 }, - { X86::VPMOVZXBDZrrkz, X86::VPMOVZXBDZrmkz, 0 }, - { X86::VPMOVZXBQZrrkz, X86::VPMOVZXBQZrmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZrrkz, X86::VPMOVZXBWZrmkz, 0 }, - { X86::VPMOVZXDQZrrkz, X86::VPMOVZXDQZrmkz, 0 }, - { X86::VPMOVZXWDZrrkz, X86::VPMOVZXWDZrmkz, 0 }, - { X86::VPMOVZXWQZrrkz, X86::VPMOVZXWQZrmkz, 0 }, - { X86::VPSHUFDZrikz, X86::VPSHUFDZmikz, 0 }, - { X86::VPSHUFHWZrikz, X86::VPSHUFHWZmikz, 0 }, - { X86::VPSHUFLWZrikz, X86::VPSHUFLWZmikz, 0 }, - { X86::VPSLLDZrikz, X86::VPSLLDZmikz, 0 }, - { X86::VPSLLQZrikz, X86::VPSLLQZmikz, 0 }, - { X86::VPSLLWZrikz, X86::VPSLLWZmikz, 0 }, - { X86::VPSRADZrikz, X86::VPSRADZmikz, 0 }, - { X86::VPSRAQZrikz, X86::VPSRAQZmikz, 0 }, - { X86::VPSRAWZrikz, X86::VPSRAWZmikz, 0 }, - { X86::VPSRLDZrikz, X86::VPSRLDZmikz, 0 }, - { X86::VPSRLQZrikz, X86::VPSRLQZmikz, 0 }, - { X86::VPSRLWZrikz, X86::VPSRLWZmikz, 0 }, - - // AVX-512VL 256-bit masked foldable instructions - { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE }, - { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE }, - { X86::VPABSBZ256rrkz, X86::VPABSBZ256rmkz, 0 }, - { X86::VPABSDZ256rrkz, X86::VPABSDZ256rmkz, 0 }, - { X86::VPABSQZ256rrkz, X86::VPABSQZ256rmkz, 0 }, - { X86::VPABSWZ256rrkz, X86::VPABSWZ256rmkz, 0 }, - { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz, 0 }, - { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz, 0 }, - { X86::VPERMPDZ256rikz, X86::VPERMPDZ256mikz, 0 }, - { X86::VPERMQZ256rikz, X86::VPERMQZ256mikz, 0 }, - { X86::VPMOVSXBDZ256rrkz, X86::VPMOVSXBDZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rrkz, X86::VPMOVSXBQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rrkz, X86::VPMOVSXBWZ256rmkz, 0 }, - { X86::VPMOVSXDQZ256rrkz, X86::VPMOVSXDQZ256rmkz, 0 }, - { X86::VPMOVSXWDZ256rrkz, X86::VPMOVSXWDZ256rmkz, 0 }, - { X86::VPMOVSXWQZ256rrkz, X86::VPMOVSXWQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rrkz, X86::VPMOVZXBDZ256rmkz, TB_NO_REVERSE 
}, - { X86::VPMOVZXBQZ256rrkz, X86::VPMOVZXBQZ256rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rrkz, X86::VPMOVZXBWZ256rmkz, 0 }, - { X86::VPMOVZXDQZ256rrkz, X86::VPMOVZXDQZ256rmkz, 0 }, - { X86::VPMOVZXWDZ256rrkz, X86::VPMOVZXWDZ256rmkz, 0 }, - { X86::VPMOVZXWQZ256rrkz, X86::VPMOVZXWQZ256rmkz, TB_NO_REVERSE }, - { X86::VPSHUFDZ256rikz, X86::VPSHUFDZ256mikz, 0 }, - { X86::VPSHUFHWZ256rikz, X86::VPSHUFHWZ256mikz, 0 }, - { X86::VPSHUFLWZ256rikz, X86::VPSHUFLWZ256mikz, 0 }, - { X86::VPSLLDZ256rikz, X86::VPSLLDZ256mikz, 0 }, - { X86::VPSLLQZ256rikz, X86::VPSLLQZ256mikz, 0 }, - { X86::VPSLLWZ256rikz, X86::VPSLLWZ256mikz, 0 }, - { X86::VPSRADZ256rikz, X86::VPSRADZ256mikz, 0 }, - { X86::VPSRAQZ256rikz, X86::VPSRAQZ256mikz, 0 }, - { X86::VPSRAWZ256rikz, X86::VPSRAWZ256mikz, 0 }, - { X86::VPSRLDZ256rikz, X86::VPSRLDZ256mikz, 0 }, - { X86::VPSRLQZ256rikz, X86::VPSRLQZ256mikz, 0 }, - { X86::VPSRLWZ256rikz, X86::VPSRLWZ256mikz, 0 }, - - // AVX-512VL 128-bit masked foldable instructions - { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE }, - { X86::VPABSBZ128rrkz, X86::VPABSBZ128rmkz, 0 }, - { X86::VPABSDZ128rrkz, X86::VPABSDZ128rmkz, 0 }, - { X86::VPABSQZ128rrkz, X86::VPABSQZ128rmkz, 0 }, - { X86::VPABSWZ128rrkz, X86::VPABSWZ128rmkz, 0 }, - { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz, 0 }, - { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz, 0 }, - { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rrkz, X86::VPMOVSXBQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rrkz, X86::VPMOVSXBWZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rrkz, X86::VPMOVSXDQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rrkz, X86::VPMOVSXWDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rrkz, X86::VPMOVSXWQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rrkz, X86::VPMOVZXBDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rrkz, X86::VPMOVZXBQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rrkz, X86::VPMOVZXBWZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rrkz, X86::VPMOVZXDQZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rrkz, X86::VPMOVZXWDZ128rmkz, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rrkz, X86::VPMOVZXWQZ128rmkz, TB_NO_REVERSE }, - { X86::VPSHUFDZ128rikz, X86::VPSHUFDZ128mikz, 0 }, - { X86::VPSHUFHWZ128rikz, X86::VPSHUFHWZ128mikz, 0 }, - { X86::VPSHUFLWZ128rikz, X86::VPSHUFLWZ128mikz, 0 }, - { X86::VPSLLDZ128rikz, X86::VPSLLDZ128mikz, 0 }, - { X86::VPSLLQZ128rikz, X86::VPSLLQZ128mikz, 0 }, - { X86::VPSLLWZ128rikz, X86::VPSLLWZ128mikz, 0 }, - { X86::VPSRADZ128rikz, X86::VPSRADZ128mikz, 0 }, - { X86::VPSRAQZ128rikz, X86::VPSRAQZ128mikz, 0 }, - { X86::VPSRAWZ128rikz, X86::VPSRAWZ128mikz, 0 }, - { X86::VPSRLDZ128rikz, X86::VPSRLDZ128mikz, 0 }, - { X86::VPSRLQZ128rikz, X86::VPSRLQZ128mikz, 0 }, - { X86::VPSRLWZ128rikz, X86::VPSRLWZ128mikz, 0 }, - - // AES foldable instructions - { X86::AESDECLASTrr, X86::AESDECLASTrm, TB_ALIGN_16 }, - { X86::AESDECrr, X86::AESDECrm, TB_ALIGN_16 }, - { X86::AESENCLASTrr, X86::AESENCLASTrm, TB_ALIGN_16 }, - { X86::AESENCrr, X86::AESENCrm, TB_ALIGN_16 }, - { X86::VAESDECLASTrr, X86::VAESDECLASTrm, 0 }, - { X86::VAESDECrr, X86::VAESDECrm, 0 }, - { X86::VAESENCLASTrr, X86::VAESENCLASTrm, 0 }, - { X86::VAESENCrr, X86::VAESENCrm, 0 }, - - // SHA foldable instructions - { X86::SHA1MSG1rr, X86::SHA1MSG1rm, TB_ALIGN_16 }, - { X86::SHA1MSG2rr, X86::SHA1MSG2rm, TB_ALIGN_16 }, - { X86::SHA1NEXTErr, X86::SHA1NEXTErm, TB_ALIGN_16 }, - { X86::SHA1RNDS4rri, X86::SHA1RNDS4rmi, TB_ALIGN_16 }, - { X86::SHA256MSG1rr, X86::SHA256MSG1rm, 
TB_ALIGN_16 },
- { X86::SHA256MSG2rr, X86::SHA256MSG2rm, TB_ALIGN_16 },
- { X86::SHA256RNDS2rr, X86::SHA256RNDS2rm, TB_ALIGN_16 }
- };
-
 for (X86MemoryFoldTableEntry Entry : MemoryFoldTable2) {
   AddTableEntry(RegOp2MemOpTable2, MemOp2RegOpTable,
                 Entry.RegOp, Entry.MemOp,
@@ -2435,1103 +150,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
                 Entry.Flags | TB_INDEX_2 | TB_FOLDED_LOAD);
 }
- static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
- // FMA4 foldable patterns
- { X86::VFMADDSS4rr, X86::VFMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSS4rr_Int, X86::VFMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDSD4rr, X86::VFMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSD4rr_Int, X86::VFMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMADDPS4rr, X86::VFMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDPD4rr, X86::VFMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDPS4Yrr, X86::VFMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDPD4Yrr, X86::VFMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr, X86::VFNMADDSS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSS4rr_Int, X86::VFNMADDSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDSD4rr, X86::VFNMADDSD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDSD4rr_Int, X86::VFNMADDSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMADDPS4rr, X86::VFNMADDPS4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4rr, X86::VFNMADDPD4rm, TB_ALIGN_NONE },
- { X86::VFNMADDPS4Yrr, X86::VFNMADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMADDPD4Yrr, X86::VFNMADDPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr, X86::VFMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSS4rr_Int, X86::VFMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBSD4rr, X86::VFMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBSD4rr_Int, X86::VFMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFMSUBPS4rr, X86::VFMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4rr, X86::VFMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBPS4Yrr, X86::VFMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBPD4Yrr, X86::VFMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr, X86::VFNMSUBSS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSS4rr_Int, X86::VFNMSUBSS4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBSD4rr, X86::VFNMSUBSD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBSD4rr_Int, X86::VFNMSUBSD4rm_Int, TB_NO_REVERSE },
- { X86::VFNMSUBPS4rr, X86::VFNMSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4rr, X86::VFNMSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFNMSUBPS4Yrr, X86::VFNMSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFNMSUBPD4Yrr, X86::VFNMSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4rr, X86::VFMADDSUBPS4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4rr, X86::VFMADDSUBPD4rm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPS4Yrr, X86::VFMADDSUBPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMADDSUBPD4Yrr, X86::VFMADDSUBPD4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4rr, X86::VFMSUBADDPS4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4rr, X86::VFMSUBADDPD4rm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPS4Yrr, X86::VFMSUBADDPS4Yrm, TB_ALIGN_NONE },
- { X86::VFMSUBADDPD4Yrr, X86::VFMSUBADDPD4Yrm, TB_ALIGN_NONE },
-
- // XOP foldable instructions
- { X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
- { X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
- { X86::VPERMIL2PDrr, X86::VPERMIL2PDrm, 0 },
- { X86::VPERMIL2PDYrr, X86::VPERMIL2PDYrm, 0 },
- { X86::VPERMIL2PSrr, X86::VPERMIL2PSrm, 0 },
- { X86::VPERMIL2PSYrr, X86::VPERMIL2PSYrm, 0 },
- { X86::VPPERMrrr, X86::VPPERMrrm, 0 },
-
- // AVX-512 instructions with 3 source operands.
- { X86::VPERMI2Brr, X86::VPERMI2Brm, 0 }, - { X86::VPERMI2Drr, X86::VPERMI2Drm, 0 }, - { X86::VPERMI2PSrr, X86::VPERMI2PSrm, 0 }, - { X86::VPERMI2PDrr, X86::VPERMI2PDrm, 0 }, - { X86::VPERMI2Qrr, X86::VPERMI2Qrm, 0 }, - { X86::VPERMI2Wrr, X86::VPERMI2Wrm, 0 }, - { X86::VPERMT2Brr, X86::VPERMT2Brm, 0 }, - { X86::VPERMT2Drr, X86::VPERMT2Drm, 0 }, - { X86::VPERMT2PSrr, X86::VPERMT2PSrm, 0 }, - { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 }, - { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 }, - { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 }, - { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 }, - { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 }, - - // AVX-512VL 256-bit instructions with 3 source operands. - { X86::VPERMI2B256rr, X86::VPERMI2B256rm, 0 }, - { X86::VPERMI2D256rr, X86::VPERMI2D256rm, 0 }, - { X86::VPERMI2PD256rr, X86::VPERMI2PD256rm, 0 }, - { X86::VPERMI2PS256rr, X86::VPERMI2PS256rm, 0 }, - { X86::VPERMI2Q256rr, X86::VPERMI2Q256rm, 0 }, - { X86::VPERMI2W256rr, X86::VPERMI2W256rm, 0 }, - { X86::VPERMT2B256rr, X86::VPERMT2B256rm, 0 }, - { X86::VPERMT2D256rr, X86::VPERMT2D256rm, 0 }, - { X86::VPERMT2PD256rr, X86::VPERMT2PD256rm, 0 }, - { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 }, - { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 }, - { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 }, - { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 }, - { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 }, - - // AVX-512VL 128-bit instructions with 3 source operands. - { X86::VPERMI2B128rr, X86::VPERMI2B128rm, 0 }, - { X86::VPERMI2D128rr, X86::VPERMI2D128rm, 0 }, - { X86::VPERMI2PD128rr, X86::VPERMI2PD128rm, 0 }, - { X86::VPERMI2PS128rr, X86::VPERMI2PS128rm, 0 }, - { X86::VPERMI2Q128rr, X86::VPERMI2Q128rm, 0 }, - { X86::VPERMI2W128rr, X86::VPERMI2W128rm, 0 }, - { X86::VPERMT2B128rr, X86::VPERMT2B128rm, 0 }, - { X86::VPERMT2D128rr, X86::VPERMT2D128rm, 0 }, - { X86::VPERMT2PD128rr, X86::VPERMT2PD128rm, 0 }, - { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 }, - { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 }, - { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 }, - { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 }, - { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 }, - - // AVX-512 masked instructions - { X86::VADDPDZrrkz, X86::VADDPDZrmkz, 0 }, - { X86::VADDPSZrrkz, X86::VADDPSZrmkz, 0 }, - { X86::VADDSDZrr_Intkz, X86::VADDSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VADDSSZrr_Intkz, X86::VADDSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VALIGNDZrrikz, X86::VALIGNDZrmikz, 0 }, - { X86::VALIGNQZrrikz, X86::VALIGNQZrmikz, 0 }, - { X86::VANDNPDZrrkz, X86::VANDNPDZrmkz, 0 }, - { X86::VANDNPSZrrkz, X86::VANDNPSZrmkz, 0 }, - { X86::VANDPDZrrkz, X86::VANDPDZrmkz, 0 }, - { X86::VANDPSZrrkz, X86::VANDPSZrmkz, 0 }, - { X86::VDIVPDZrrkz, X86::VDIVPDZrmkz, 0 }, - { X86::VDIVPSZrrkz, X86::VDIVPSZrmkz, 0 }, - { X86::VDIVSDZrr_Intkz, X86::VDIVSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VDIVSSZrr_Intkz, X86::VDIVSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrrkz, X86::VINSERTF32x4Zrmkz, 0 }, - { X86::VINSERTF32x8Zrrkz, X86::VINSERTF32x8Zrmkz, 0 }, - { X86::VINSERTF64x2Zrrkz, X86::VINSERTF64x2Zrmkz, 0 }, - { X86::VINSERTF64x4Zrrkz, X86::VINSERTF64x4Zrmkz, 0 }, - { X86::VINSERTI32x4Zrrkz, X86::VINSERTI32x4Zrmkz, 0 }, - { X86::VINSERTI32x8Zrrkz, X86::VINSERTI32x8Zrmkz, 0 }, - { X86::VINSERTI64x2Zrrkz, X86::VINSERTI64x2Zrmkz, 0 }, - { X86::VINSERTI64x4Zrrkz, X86::VINSERTI64x4Zrmkz, 0 }, - { X86::VMAXCPDZrrkz, X86::VMAXCPDZrmkz, 0 }, - { X86::VMAXCPSZrrkz, X86::VMAXCPSZrmkz, 0 }, - { X86::VMAXPDZrrkz, X86::VMAXPDZrmkz, 0 }, - { X86::VMAXPSZrrkz, 
X86::VMAXPSZrmkz, 0 }, - { X86::VMAXSDZrr_Intkz, X86::VMAXSDZrm_Intkz, 0 }, - { X86::VMAXSSZrr_Intkz, X86::VMAXSSZrm_Intkz, 0 }, - { X86::VMINCPDZrrkz, X86::VMINCPDZrmkz, 0 }, - { X86::VMINCPSZrrkz, X86::VMINCPSZrmkz, 0 }, - { X86::VMINPDZrrkz, X86::VMINPDZrmkz, 0 }, - { X86::VMINPSZrrkz, X86::VMINPSZrmkz, 0 }, - { X86::VMINSDZrr_Intkz, X86::VMINSDZrm_Intkz, 0 }, - { X86::VMINSSZrr_Intkz, X86::VMINSSZrm_Intkz, 0 }, - { X86::VMULPDZrrkz, X86::VMULPDZrmkz, 0 }, - { X86::VMULPSZrrkz, X86::VMULPSZrmkz, 0 }, - { X86::VMULSDZrr_Intkz, X86::VMULSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VMULSSZrr_Intkz, X86::VMULSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VORPDZrrkz, X86::VORPDZrmkz, 0 }, - { X86::VORPSZrrkz, X86::VORPSZrmkz, 0 }, - { X86::VPACKSSDWZrrkz, X86::VPACKSSDWZrmkz, 0 }, - { X86::VPACKSSWBZrrkz, X86::VPACKSSWBZrmkz, 0 }, - { X86::VPACKUSDWZrrkz, X86::VPACKUSDWZrmkz, 0 }, - { X86::VPACKUSWBZrrkz, X86::VPACKUSWBZrmkz, 0 }, - { X86::VPADDBZrrkz, X86::VPADDBZrmkz, 0 }, - { X86::VPADDDZrrkz, X86::VPADDDZrmkz, 0 }, - { X86::VPADDQZrrkz, X86::VPADDQZrmkz, 0 }, - { X86::VPADDSBZrrkz, X86::VPADDSBZrmkz, 0 }, - { X86::VPADDSWZrrkz, X86::VPADDSWZrmkz, 0 }, - { X86::VPADDUSBZrrkz, X86::VPADDUSBZrmkz, 0 }, - { X86::VPADDUSWZrrkz, X86::VPADDUSWZrmkz, 0 }, - { X86::VPADDWZrrkz, X86::VPADDWZrmkz, 0 }, - { X86::VPALIGNRZrrikz, X86::VPALIGNRZrmikz, 0 }, - { X86::VPANDDZrrkz, X86::VPANDDZrmkz, 0 }, - { X86::VPANDNDZrrkz, X86::VPANDNDZrmkz, 0 }, - { X86::VPANDNQZrrkz, X86::VPANDNQZrmkz, 0 }, - { X86::VPANDQZrrkz, X86::VPANDQZrmkz, 0 }, - { X86::VPAVGBZrrkz, X86::VPAVGBZrmkz, 0 }, - { X86::VPAVGWZrrkz, X86::VPAVGWZrmkz, 0 }, - { X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 }, - { X86::VPERMDZrrkz, X86::VPERMDZrmkz, 0 }, - { X86::VPERMILPDZrrkz, X86::VPERMILPDZrmkz, 0 }, - { X86::VPERMILPSZrrkz, X86::VPERMILPSZrmkz, 0 }, - { X86::VPERMPDZrrkz, X86::VPERMPDZrmkz, 0 }, - { X86::VPERMPSZrrkz, X86::VPERMPSZrmkz, 0 }, - { X86::VPERMQZrrkz, X86::VPERMQZrmkz, 0 }, - { X86::VPERMWZrrkz, X86::VPERMWZrmkz, 0 }, - { X86::VPMADDUBSWZrrkz, X86::VPMADDUBSWZrmkz, 0 }, - { X86::VPMADDWDZrrkz, X86::VPMADDWDZrmkz, 0 }, - { X86::VPMAXSBZrrkz, X86::VPMAXSBZrmkz, 0 }, - { X86::VPMAXSDZrrkz, X86::VPMAXSDZrmkz, 0 }, - { X86::VPMAXSQZrrkz, X86::VPMAXSQZrmkz, 0 }, - { X86::VPMAXSWZrrkz, X86::VPMAXSWZrmkz, 0 }, - { X86::VPMAXUBZrrkz, X86::VPMAXUBZrmkz, 0 }, - { X86::VPMAXUDZrrkz, X86::VPMAXUDZrmkz, 0 }, - { X86::VPMAXUQZrrkz, X86::VPMAXUQZrmkz, 0 }, - { X86::VPMAXUWZrrkz, X86::VPMAXUWZrmkz, 0 }, - { X86::VPMINSBZrrkz, X86::VPMINSBZrmkz, 0 }, - { X86::VPMINSDZrrkz, X86::VPMINSDZrmkz, 0 }, - { X86::VPMINSQZrrkz, X86::VPMINSQZrmkz, 0 }, - { X86::VPMINSWZrrkz, X86::VPMINSWZrmkz, 0 }, - { X86::VPMINUBZrrkz, X86::VPMINUBZrmkz, 0 }, - { X86::VPMINUDZrrkz, X86::VPMINUDZrmkz, 0 }, - { X86::VPMINUQZrrkz, X86::VPMINUQZrmkz, 0 }, - { X86::VPMINUWZrrkz, X86::VPMINUWZrmkz, 0 }, - { X86::VPMULLDZrrkz, X86::VPMULLDZrmkz, 0 }, - { X86::VPMULLQZrrkz, X86::VPMULLQZrmkz, 0 }, - { X86::VPMULLWZrrkz, X86::VPMULLWZrmkz, 0 }, - { X86::VPMULDQZrrkz, X86::VPMULDQZrmkz, 0 }, - { X86::VPMULUDQZrrkz, X86::VPMULUDQZrmkz, 0 }, - { X86::VPORDZrrkz, X86::VPORDZrmkz, 0 }, - { X86::VPORQZrrkz, X86::VPORQZrmkz, 0 }, - { X86::VPSHUFBZrrkz, X86::VPSHUFBZrmkz, 0 }, - { X86::VPSLLDZrrkz, X86::VPSLLDZrmkz, 0 }, - { X86::VPSLLQZrrkz, X86::VPSLLQZrmkz, 0 }, - { X86::VPSLLVDZrrkz, X86::VPSLLVDZrmkz, 0 }, - { X86::VPSLLVQZrrkz, X86::VPSLLVQZrmkz, 0 }, - { X86::VPSLLVWZrrkz, X86::VPSLLVWZrmkz, 0 }, - { X86::VPSLLWZrrkz, X86::VPSLLWZrmkz, 0 }, - { X86::VPSRADZrrkz, X86::VPSRADZrmkz, 0 }, - { 
X86::VPSRAQZrrkz, X86::VPSRAQZrmkz, 0 }, - { X86::VPSRAVDZrrkz, X86::VPSRAVDZrmkz, 0 }, - { X86::VPSRAVQZrrkz, X86::VPSRAVQZrmkz, 0 }, - { X86::VPSRAVWZrrkz, X86::VPSRAVWZrmkz, 0 }, - { X86::VPSRAWZrrkz, X86::VPSRAWZrmkz, 0 }, - { X86::VPSRLDZrrkz, X86::VPSRLDZrmkz, 0 }, - { X86::VPSRLQZrrkz, X86::VPSRLQZrmkz, 0 }, - { X86::VPSRLVDZrrkz, X86::VPSRLVDZrmkz, 0 }, - { X86::VPSRLVQZrrkz, X86::VPSRLVQZrmkz, 0 }, - { X86::VPSRLVWZrrkz, X86::VPSRLVWZrmkz, 0 }, - { X86::VPSRLWZrrkz, X86::VPSRLWZrmkz, 0 }, - { X86::VPSUBBZrrkz, X86::VPSUBBZrmkz, 0 }, - { X86::VPSUBDZrrkz, X86::VPSUBDZrmkz, 0 }, - { X86::VPSUBQZrrkz, X86::VPSUBQZrmkz, 0 }, - { X86::VPSUBSBZrrkz, X86::VPSUBSBZrmkz, 0 }, - { X86::VPSUBSWZrrkz, X86::VPSUBSWZrmkz, 0 }, - { X86::VPSUBUSBZrrkz, X86::VPSUBUSBZrmkz, 0 }, - { X86::VPSUBUSWZrrkz, X86::VPSUBUSWZrmkz, 0 }, - { X86::VPSUBWZrrkz, X86::VPSUBWZrmkz, 0 }, - { X86::VPUNPCKHBWZrrkz, X86::VPUNPCKHBWZrmkz, 0 }, - { X86::VPUNPCKHDQZrrkz, X86::VPUNPCKHDQZrmkz, 0 }, - { X86::VPUNPCKHQDQZrrkz, X86::VPUNPCKHQDQZrmkz, 0 }, - { X86::VPUNPCKHWDZrrkz, X86::VPUNPCKHWDZrmkz, 0 }, - { X86::VPUNPCKLBWZrrkz, X86::VPUNPCKLBWZrmkz, 0 }, - { X86::VPUNPCKLDQZrrkz, X86::VPUNPCKLDQZrmkz, 0 }, - { X86::VPUNPCKLQDQZrrkz, X86::VPUNPCKLQDQZrmkz, 0 }, - { X86::VPUNPCKLWDZrrkz, X86::VPUNPCKLWDZrmkz, 0 }, - { X86::VPXORDZrrkz, X86::VPXORDZrmkz, 0 }, - { X86::VPXORQZrrkz, X86::VPXORQZrmkz, 0 }, - { X86::VSHUFPDZrrikz, X86::VSHUFPDZrmikz, 0 }, - { X86::VSHUFPSZrrikz, X86::VSHUFPSZrmikz, 0 }, - { X86::VSUBPDZrrkz, X86::VSUBPDZrmkz, 0 }, - { X86::VSUBPSZrrkz, X86::VSUBPSZrmkz, 0 }, - { X86::VSUBSDZrr_Intkz, X86::VSUBSDZrm_Intkz, TB_NO_REVERSE }, - { X86::VSUBSSZrr_Intkz, X86::VSUBSSZrm_Intkz, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrrkz, X86::VUNPCKHPDZrmkz, 0 }, - { X86::VUNPCKHPSZrrkz, X86::VUNPCKHPSZrmkz, 0 }, - { X86::VUNPCKLPDZrrkz, X86::VUNPCKLPDZrmkz, 0 }, - { X86::VUNPCKLPSZrrkz, X86::VUNPCKLPSZrmkz, 0 }, - { X86::VXORPDZrrkz, X86::VXORPDZrmkz, 0 }, - { X86::VXORPSZrrkz, X86::VXORPSZrmkz, 0 }, - - // AVX-512{F,VL} masked arithmetic instructions 256-bit - { X86::VADDPDZ256rrkz, X86::VADDPDZ256rmkz, 0 }, - { X86::VADDPSZ256rrkz, X86::VADDPSZ256rmkz, 0 }, - { X86::VALIGNDZ256rrikz, X86::VALIGNDZ256rmikz, 0 }, - { X86::VALIGNQZ256rrikz, X86::VALIGNQZ256rmikz, 0 }, - { X86::VANDNPDZ256rrkz, X86::VANDNPDZ256rmkz, 0 }, - { X86::VANDNPSZ256rrkz, X86::VANDNPSZ256rmkz, 0 }, - { X86::VANDPDZ256rrkz, X86::VANDPDZ256rmkz, 0 }, - { X86::VANDPSZ256rrkz, X86::VANDPSZ256rmkz, 0 }, - { X86::VDIVPDZ256rrkz, X86::VDIVPDZ256rmkz, 0 }, - { X86::VDIVPSZ256rrkz, X86::VDIVPSZ256rmkz, 0 }, - { X86::VINSERTF32x4Z256rrkz, X86::VINSERTF32x4Z256rmkz, 0 }, - { X86::VINSERTF64x2Z256rrkz, X86::VINSERTF64x2Z256rmkz, 0 }, - { X86::VINSERTI32x4Z256rrkz, X86::VINSERTI32x4Z256rmkz, 0 }, - { X86::VINSERTI64x2Z256rrkz, X86::VINSERTI64x2Z256rmkz, 0 }, - { X86::VMAXCPDZ256rrkz, X86::VMAXCPDZ256rmkz, 0 }, - { X86::VMAXCPSZ256rrkz, X86::VMAXCPSZ256rmkz, 0 }, - { X86::VMAXPDZ256rrkz, X86::VMAXPDZ256rmkz, 0 }, - { X86::VMAXPSZ256rrkz, X86::VMAXPSZ256rmkz, 0 }, - { X86::VMINCPDZ256rrkz, X86::VMINCPDZ256rmkz, 0 }, - { X86::VMINCPSZ256rrkz, X86::VMINCPSZ256rmkz, 0 }, - { X86::VMINPDZ256rrkz, X86::VMINPDZ256rmkz, 0 }, - { X86::VMINPSZ256rrkz, X86::VMINPSZ256rmkz, 0 }, - { X86::VMULPDZ256rrkz, X86::VMULPDZ256rmkz, 0 }, - { X86::VMULPSZ256rrkz, X86::VMULPSZ256rmkz, 0 }, - { X86::VORPDZ256rrkz, X86::VORPDZ256rmkz, 0 }, - { X86::VORPSZ256rrkz, X86::VORPSZ256rmkz, 0 }, - { X86::VPACKSSDWZ256rrkz, X86::VPACKSSDWZ256rmkz, 0 }, - { X86::VPACKSSWBZ256rrkz, 
X86::VPACKSSWBZ256rmkz, 0 }, - { X86::VPACKUSDWZ256rrkz, X86::VPACKUSDWZ256rmkz, 0 }, - { X86::VPACKUSWBZ256rrkz, X86::VPACKUSWBZ256rmkz, 0 }, - { X86::VPADDBZ256rrkz, X86::VPADDBZ256rmkz, 0 }, - { X86::VPADDDZ256rrkz, X86::VPADDDZ256rmkz, 0 }, - { X86::VPADDQZ256rrkz, X86::VPADDQZ256rmkz, 0 }, - { X86::VPADDSBZ256rrkz, X86::VPADDSBZ256rmkz, 0 }, - { X86::VPADDSWZ256rrkz, X86::VPADDSWZ256rmkz, 0 }, - { X86::VPADDUSBZ256rrkz, X86::VPADDUSBZ256rmkz, 0 }, - { X86::VPADDUSWZ256rrkz, X86::VPADDUSWZ256rmkz, 0 }, - { X86::VPADDWZ256rrkz, X86::VPADDWZ256rmkz, 0 }, - { X86::VPALIGNRZ256rrikz, X86::VPALIGNRZ256rmikz, 0 }, - { X86::VPANDDZ256rrkz, X86::VPANDDZ256rmkz, 0 }, - { X86::VPANDNDZ256rrkz, X86::VPANDNDZ256rmkz, 0 }, - { X86::VPANDNQZ256rrkz, X86::VPANDNQZ256rmkz, 0 }, - { X86::VPANDQZ256rrkz, X86::VPANDQZ256rmkz, 0 }, - { X86::VPAVGBZ256rrkz, X86::VPAVGBZ256rmkz, 0 }, - { X86::VPAVGWZ256rrkz, X86::VPAVGWZ256rmkz, 0 }, - { X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 }, - { X86::VPERMDZ256rrkz, X86::VPERMDZ256rmkz, 0 }, - { X86::VPERMILPDZ256rrkz, X86::VPERMILPDZ256rmkz, 0 }, - { X86::VPERMILPSZ256rrkz, X86::VPERMILPSZ256rmkz, 0 }, - { X86::VPERMPDZ256rrkz, X86::VPERMPDZ256rmkz, 0 }, - { X86::VPERMPSZ256rrkz, X86::VPERMPSZ256rmkz, 0 }, - { X86::VPERMQZ256rrkz, X86::VPERMQZ256rmkz, 0 }, - { X86::VPERMWZ256rrkz, X86::VPERMWZ256rmkz, 0 }, - { X86::VPMADDUBSWZ256rrkz, X86::VPMADDUBSWZ256rmkz, 0 }, - { X86::VPMADDWDZ256rrkz, X86::VPMADDWDZ256rmkz, 0 }, - { X86::VPMAXSBZ256rrkz, X86::VPMAXSBZ256rmkz, 0 }, - { X86::VPMAXSDZ256rrkz, X86::VPMAXSDZ256rmkz, 0 }, - { X86::VPMAXSQZ256rrkz, X86::VPMAXSQZ256rmkz, 0 }, - { X86::VPMAXSWZ256rrkz, X86::VPMAXSWZ256rmkz, 0 }, - { X86::VPMAXUBZ256rrkz, X86::VPMAXUBZ256rmkz, 0 }, - { X86::VPMAXUDZ256rrkz, X86::VPMAXUDZ256rmkz, 0 }, - { X86::VPMAXUQZ256rrkz, X86::VPMAXUQZ256rmkz, 0 }, - { X86::VPMAXUWZ256rrkz, X86::VPMAXUWZ256rmkz, 0 }, - { X86::VPMINSBZ256rrkz, X86::VPMINSBZ256rmkz, 0 }, - { X86::VPMINSDZ256rrkz, X86::VPMINSDZ256rmkz, 0 }, - { X86::VPMINSQZ256rrkz, X86::VPMINSQZ256rmkz, 0 }, - { X86::VPMINSWZ256rrkz, X86::VPMINSWZ256rmkz, 0 }, - { X86::VPMINUBZ256rrkz, X86::VPMINUBZ256rmkz, 0 }, - { X86::VPMINUDZ256rrkz, X86::VPMINUDZ256rmkz, 0 }, - { X86::VPMINUQZ256rrkz, X86::VPMINUQZ256rmkz, 0 }, - { X86::VPMINUWZ256rrkz, X86::VPMINUWZ256rmkz, 0 }, - { X86::VPMULDQZ256rrkz, X86::VPMULDQZ256rmkz, 0 }, - { X86::VPMULLDZ256rrkz, X86::VPMULLDZ256rmkz, 0 }, - { X86::VPMULLQZ256rrkz, X86::VPMULLQZ256rmkz, 0 }, - { X86::VPMULLWZ256rrkz, X86::VPMULLWZ256rmkz, 0 }, - { X86::VPMULUDQZ256rrkz, X86::VPMULUDQZ256rmkz, 0 }, - { X86::VPORDZ256rrkz, X86::VPORDZ256rmkz, 0 }, - { X86::VPORQZ256rrkz, X86::VPORQZ256rmkz, 0 }, - { X86::VPSHUFBZ256rrkz, X86::VPSHUFBZ256rmkz, 0 }, - { X86::VPSLLDZ256rrkz, X86::VPSLLDZ256rmkz, 0 }, - { X86::VPSLLQZ256rrkz, X86::VPSLLQZ256rmkz, 0 }, - { X86::VPSLLVDZ256rrkz, X86::VPSLLVDZ256rmkz, 0 }, - { X86::VPSLLVQZ256rrkz, X86::VPSLLVQZ256rmkz, 0 }, - { X86::VPSLLVWZ256rrkz, X86::VPSLLVWZ256rmkz, 0 }, - { X86::VPSLLWZ256rrkz, X86::VPSLLWZ256rmkz, 0 }, - { X86::VPSRADZ256rrkz, X86::VPSRADZ256rmkz, 0 }, - { X86::VPSRAQZ256rrkz, X86::VPSRAQZ256rmkz, 0 }, - { X86::VPSRAVDZ256rrkz, X86::VPSRAVDZ256rmkz, 0 }, - { X86::VPSRAVQZ256rrkz, X86::VPSRAVQZ256rmkz, 0 }, - { X86::VPSRAVWZ256rrkz, X86::VPSRAVWZ256rmkz, 0 }, - { X86::VPSRAWZ256rrkz, X86::VPSRAWZ256rmkz, 0 }, - { X86::VPSRLDZ256rrkz, X86::VPSRLDZ256rmkz, 0 }, - { X86::VPSRLQZ256rrkz, X86::VPSRLQZ256rmkz, 0 }, - { X86::VPSRLVDZ256rrkz, X86::VPSRLVDZ256rmkz, 0 }, - { X86::VPSRLVQZ256rrkz, 
X86::VPSRLVQZ256rmkz, 0 }, - { X86::VPSRLVWZ256rrkz, X86::VPSRLVWZ256rmkz, 0 }, - { X86::VPSRLWZ256rrkz, X86::VPSRLWZ256rmkz, 0 }, - { X86::VPSUBBZ256rrkz, X86::VPSUBBZ256rmkz, 0 }, - { X86::VPSUBDZ256rrkz, X86::VPSUBDZ256rmkz, 0 }, - { X86::VPSUBQZ256rrkz, X86::VPSUBQZ256rmkz, 0 }, - { X86::VPSUBSBZ256rrkz, X86::VPSUBSBZ256rmkz, 0 }, - { X86::VPSUBSWZ256rrkz, X86::VPSUBSWZ256rmkz, 0 }, - { X86::VPSUBUSBZ256rrkz, X86::VPSUBUSBZ256rmkz, 0 }, - { X86::VPSUBUSWZ256rrkz, X86::VPSUBUSWZ256rmkz, 0 }, - { X86::VPSUBWZ256rrkz, X86::VPSUBWZ256rmkz, 0 }, - { X86::VPUNPCKHBWZ256rrkz, X86::VPUNPCKHBWZ256rmkz, 0 }, - { X86::VPUNPCKHDQZ256rrkz, X86::VPUNPCKHDQZ256rmkz, 0 }, - { X86::VPUNPCKHQDQZ256rrkz, X86::VPUNPCKHQDQZ256rmkz, 0 }, - { X86::VPUNPCKHWDZ256rrkz, X86::VPUNPCKHWDZ256rmkz, 0 }, - { X86::VPUNPCKLBWZ256rrkz, X86::VPUNPCKLBWZ256rmkz, 0 }, - { X86::VPUNPCKLDQZ256rrkz, X86::VPUNPCKLDQZ256rmkz, 0 }, - { X86::VPUNPCKLQDQZ256rrkz, X86::VPUNPCKLQDQZ256rmkz, 0 }, - { X86::VPUNPCKLWDZ256rrkz, X86::VPUNPCKLWDZ256rmkz, 0 }, - { X86::VPXORDZ256rrkz, X86::VPXORDZ256rmkz, 0 }, - { X86::VPXORQZ256rrkz, X86::VPXORQZ256rmkz, 0 }, - { X86::VSHUFPDZ256rrikz, X86::VSHUFPDZ256rmikz, 0 }, - { X86::VSHUFPSZ256rrikz, X86::VSHUFPSZ256rmikz, 0 }, - { X86::VSUBPDZ256rrkz, X86::VSUBPDZ256rmkz, 0 }, - { X86::VSUBPSZ256rrkz, X86::VSUBPSZ256rmkz, 0 }, - { X86::VUNPCKHPDZ256rrkz, X86::VUNPCKHPDZ256rmkz, 0 }, - { X86::VUNPCKHPSZ256rrkz, X86::VUNPCKHPSZ256rmkz, 0 }, - { X86::VUNPCKLPDZ256rrkz, X86::VUNPCKLPDZ256rmkz, 0 }, - { X86::VUNPCKLPSZ256rrkz, X86::VUNPCKLPSZ256rmkz, 0 }, - { X86::VXORPDZ256rrkz, X86::VXORPDZ256rmkz, 0 }, - { X86::VXORPSZ256rrkz, X86::VXORPSZ256rmkz, 0 }, - - // AVX-512{F,VL} masked arithmetic instructions 128-bit - { X86::VADDPDZ128rrkz, X86::VADDPDZ128rmkz, 0 }, - { X86::VADDPSZ128rrkz, X86::VADDPSZ128rmkz, 0 }, - { X86::VALIGNDZ128rrikz, X86::VALIGNDZ128rmikz, 0 }, - { X86::VALIGNQZ128rrikz, X86::VALIGNQZ128rmikz, 0 }, - { X86::VANDNPDZ128rrkz, X86::VANDNPDZ128rmkz, 0 }, - { X86::VANDNPSZ128rrkz, X86::VANDNPSZ128rmkz, 0 }, - { X86::VANDPDZ128rrkz, X86::VANDPDZ128rmkz, 0 }, - { X86::VANDPSZ128rrkz, X86::VANDPSZ128rmkz, 0 }, - { X86::VDIVPDZ128rrkz, X86::VDIVPDZ128rmkz, 0 }, - { X86::VDIVPSZ128rrkz, X86::VDIVPSZ128rmkz, 0 }, - { X86::VMAXCPDZ128rrkz, X86::VMAXCPDZ128rmkz, 0 }, - { X86::VMAXCPSZ128rrkz, X86::VMAXCPSZ128rmkz, 0 }, - { X86::VMAXPDZ128rrkz, X86::VMAXPDZ128rmkz, 0 }, - { X86::VMAXPSZ128rrkz, X86::VMAXPSZ128rmkz, 0 }, - { X86::VMINCPDZ128rrkz, X86::VMINCPDZ128rmkz, 0 }, - { X86::VMINCPSZ128rrkz, X86::VMINCPSZ128rmkz, 0 }, - { X86::VMINPDZ128rrkz, X86::VMINPDZ128rmkz, 0 }, - { X86::VMINPSZ128rrkz, X86::VMINPSZ128rmkz, 0 }, - { X86::VMULPDZ128rrkz, X86::VMULPDZ128rmkz, 0 }, - { X86::VMULPSZ128rrkz, X86::VMULPSZ128rmkz, 0 }, - { X86::VORPDZ128rrkz, X86::VORPDZ128rmkz, 0 }, - { X86::VORPSZ128rrkz, X86::VORPSZ128rmkz, 0 }, - { X86::VPACKSSDWZ128rrkz, X86::VPACKSSDWZ128rmkz, 0 }, - { X86::VPACKSSWBZ128rrkz, X86::VPACKSSWBZ128rmkz, 0 }, - { X86::VPACKUSDWZ128rrkz, X86::VPACKUSDWZ128rmkz, 0 }, - { X86::VPACKUSWBZ128rrkz, X86::VPACKUSWBZ128rmkz, 0 }, - { X86::VPADDBZ128rrkz, X86::VPADDBZ128rmkz, 0 }, - { X86::VPADDDZ128rrkz, X86::VPADDDZ128rmkz, 0 }, - { X86::VPADDQZ128rrkz, X86::VPADDQZ128rmkz, 0 }, - { X86::VPADDSBZ128rrkz, X86::VPADDSBZ128rmkz, 0 }, - { X86::VPADDSWZ128rrkz, X86::VPADDSWZ128rmkz, 0 }, - { X86::VPADDUSBZ128rrkz, X86::VPADDUSBZ128rmkz, 0 }, - { X86::VPADDUSWZ128rrkz, X86::VPADDUSWZ128rmkz, 0 }, - { X86::VPADDWZ128rrkz, X86::VPADDWZ128rmkz, 0 }, - { X86::VPALIGNRZ128rrikz, 
X86::VPALIGNRZ128rmikz, 0 }, - { X86::VPANDDZ128rrkz, X86::VPANDDZ128rmkz, 0 }, - { X86::VPANDNDZ128rrkz, X86::VPANDNDZ128rmkz, 0 }, - { X86::VPANDNQZ128rrkz, X86::VPANDNQZ128rmkz, 0 }, - { X86::VPANDQZ128rrkz, X86::VPANDQZ128rmkz, 0 }, - { X86::VPAVGBZ128rrkz, X86::VPAVGBZ128rmkz, 0 }, - { X86::VPAVGWZ128rrkz, X86::VPAVGWZ128rmkz, 0 }, - { X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 }, - { X86::VPERMILPDZ128rrkz, X86::VPERMILPDZ128rmkz, 0 }, - { X86::VPERMILPSZ128rrkz, X86::VPERMILPSZ128rmkz, 0 }, - { X86::VPERMWZ128rrkz, X86::VPERMWZ128rmkz, 0 }, - { X86::VPMADDUBSWZ128rrkz, X86::VPMADDUBSWZ128rmkz, 0 }, - { X86::VPMADDWDZ128rrkz, X86::VPMADDWDZ128rmkz, 0 }, - { X86::VPMAXSBZ128rrkz, X86::VPMAXSBZ128rmkz, 0 }, - { X86::VPMAXSDZ128rrkz, X86::VPMAXSDZ128rmkz, 0 }, - { X86::VPMAXSQZ128rrkz, X86::VPMAXSQZ128rmkz, 0 }, - { X86::VPMAXSWZ128rrkz, X86::VPMAXSWZ128rmkz, 0 }, - { X86::VPMAXUBZ128rrkz, X86::VPMAXUBZ128rmkz, 0 }, - { X86::VPMAXUDZ128rrkz, X86::VPMAXUDZ128rmkz, 0 }, - { X86::VPMAXUQZ128rrkz, X86::VPMAXUQZ128rmkz, 0 }, - { X86::VPMAXUWZ128rrkz, X86::VPMAXUWZ128rmkz, 0 }, - { X86::VPMINSBZ128rrkz, X86::VPMINSBZ128rmkz, 0 }, - { X86::VPMINSDZ128rrkz, X86::VPMINSDZ128rmkz, 0 }, - { X86::VPMINSQZ128rrkz, X86::VPMINSQZ128rmkz, 0 }, - { X86::VPMINSWZ128rrkz, X86::VPMINSWZ128rmkz, 0 }, - { X86::VPMINUBZ128rrkz, X86::VPMINUBZ128rmkz, 0 }, - { X86::VPMINUDZ128rrkz, X86::VPMINUDZ128rmkz, 0 }, - { X86::VPMINUQZ128rrkz, X86::VPMINUQZ128rmkz, 0 }, - { X86::VPMINUWZ128rrkz, X86::VPMINUWZ128rmkz, 0 }, - { X86::VPMULDQZ128rrkz, X86::VPMULDQZ128rmkz, 0 }, - { X86::VPMULLDZ128rrkz, X86::VPMULLDZ128rmkz, 0 }, - { X86::VPMULLQZ128rrkz, X86::VPMULLQZ128rmkz, 0 }, - { X86::VPMULLWZ128rrkz, X86::VPMULLWZ128rmkz, 0 }, - { X86::VPMULUDQZ128rrkz, X86::VPMULUDQZ128rmkz, 0 }, - { X86::VPORDZ128rrkz, X86::VPORDZ128rmkz, 0 }, - { X86::VPORQZ128rrkz, X86::VPORQZ128rmkz, 0 }, - { X86::VPSHUFBZ128rrkz, X86::VPSHUFBZ128rmkz, 0 }, - { X86::VPSLLDZ128rrkz, X86::VPSLLDZ128rmkz, 0 }, - { X86::VPSLLQZ128rrkz, X86::VPSLLQZ128rmkz, 0 }, - { X86::VPSLLVDZ128rrkz, X86::VPSLLVDZ128rmkz, 0 }, - { X86::VPSLLVQZ128rrkz, X86::VPSLLVQZ128rmkz, 0 }, - { X86::VPSLLVWZ128rrkz, X86::VPSLLVWZ128rmkz, 0 }, - { X86::VPSLLWZ128rrkz, X86::VPSLLWZ128rmkz, 0 }, - { X86::VPSRADZ128rrkz, X86::VPSRADZ128rmkz, 0 }, - { X86::VPSRAQZ128rrkz, X86::VPSRAQZ128rmkz, 0 }, - { X86::VPSRAVDZ128rrkz, X86::VPSRAVDZ128rmkz, 0 }, - { X86::VPSRAVQZ128rrkz, X86::VPSRAVQZ128rmkz, 0 }, - { X86::VPSRAVWZ128rrkz, X86::VPSRAVWZ128rmkz, 0 }, - { X86::VPSRAWZ128rrkz, X86::VPSRAWZ128rmkz, 0 }, - { X86::VPSRLDZ128rrkz, X86::VPSRLDZ128rmkz, 0 }, - { X86::VPSRLQZ128rrkz, X86::VPSRLQZ128rmkz, 0 }, - { X86::VPSRLVDZ128rrkz, X86::VPSRLVDZ128rmkz, 0 }, - { X86::VPSRLVQZ128rrkz, X86::VPSRLVQZ128rmkz, 0 }, - { X86::VPSRLVWZ128rrkz, X86::VPSRLVWZ128rmkz, 0 }, - { X86::VPSRLWZ128rrkz, X86::VPSRLWZ128rmkz, 0 }, - { X86::VPSUBBZ128rrkz, X86::VPSUBBZ128rmkz, 0 }, - { X86::VPSUBDZ128rrkz, X86::VPSUBDZ128rmkz, 0 }, - { X86::VPSUBQZ128rrkz, X86::VPSUBQZ128rmkz, 0 }, - { X86::VPSUBSBZ128rrkz, X86::VPSUBSBZ128rmkz, 0 }, - { X86::VPSUBSWZ128rrkz, X86::VPSUBSWZ128rmkz, 0 }, - { X86::VPSUBUSBZ128rrkz, X86::VPSUBUSBZ128rmkz, 0 }, - { X86::VPSUBUSWZ128rrkz, X86::VPSUBUSWZ128rmkz, 0 }, - { X86::VPSUBWZ128rrkz, X86::VPSUBWZ128rmkz, 0 }, - { X86::VPUNPCKHBWZ128rrkz, X86::VPUNPCKHBWZ128rmkz, 0 }, - { X86::VPUNPCKHDQZ128rrkz, X86::VPUNPCKHDQZ128rmkz, 0 }, - { X86::VPUNPCKHQDQZ128rrkz, X86::VPUNPCKHQDQZ128rmkz, 0 }, - { X86::VPUNPCKHWDZ128rrkz, X86::VPUNPCKHWDZ128rmkz, 0 }, - { 
X86::VPUNPCKLBWZ128rrkz, X86::VPUNPCKLBWZ128rmkz, 0 }, - { X86::VPUNPCKLDQZ128rrkz, X86::VPUNPCKLDQZ128rmkz, 0 }, - { X86::VPUNPCKLQDQZ128rrkz, X86::VPUNPCKLQDQZ128rmkz, 0 }, - { X86::VPUNPCKLWDZ128rrkz, X86::VPUNPCKLWDZ128rmkz, 0 }, - { X86::VPXORDZ128rrkz, X86::VPXORDZ128rmkz, 0 }, - { X86::VPXORQZ128rrkz, X86::VPXORQZ128rmkz, 0 }, - { X86::VSHUFPDZ128rrikz, X86::VSHUFPDZ128rmikz, 0 }, - { X86::VSHUFPSZ128rrikz, X86::VSHUFPSZ128rmikz, 0 }, - { X86::VSUBPDZ128rrkz, X86::VSUBPDZ128rmkz, 0 }, - { X86::VSUBPSZ128rrkz, X86::VSUBPSZ128rmkz, 0 }, - { X86::VUNPCKHPDZ128rrkz, X86::VUNPCKHPDZ128rmkz, 0 }, - { X86::VUNPCKHPSZ128rrkz, X86::VUNPCKHPSZ128rmkz, 0 }, - { X86::VUNPCKLPDZ128rrkz, X86::VUNPCKLPDZ128rmkz, 0 }, - { X86::VUNPCKLPSZ128rrkz, X86::VUNPCKLPSZ128rmkz, 0 }, - { X86::VXORPDZ128rrkz, X86::VXORPDZ128rmkz, 0 }, - { X86::VXORPSZ128rrkz, X86::VXORPSZ128rmkz, 0 }, - - // AVX-512 masked foldable instructions - { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE }, - { X86::VPABSBZrrk, X86::VPABSBZrmk, 0 }, - { X86::VPABSDZrrk, X86::VPABSDZrmk, 0 }, - { X86::VPABSQZrrk, X86::VPABSQZrmk, 0 }, - { X86::VPABSWZrrk, X86::VPABSWZrmk, 0 }, - { X86::VPERMILPDZrik, X86::VPERMILPDZmik, 0 }, - { X86::VPERMILPSZrik, X86::VPERMILPSZmik, 0 }, - { X86::VPERMPDZrik, X86::VPERMPDZmik, 0 }, - { X86::VPERMQZrik, X86::VPERMQZmik, 0 }, - { X86::VPMOVSXBDZrrk, X86::VPMOVSXBDZrmk, 0 }, - { X86::VPMOVSXBQZrrk, X86::VPMOVSXBQZrmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZrrk, X86::VPMOVSXBWZrmk, 0 }, - { X86::VPMOVSXDQZrrk, X86::VPMOVSXDQZrmk, 0 }, - { X86::VPMOVSXWDZrrk, X86::VPMOVSXWDZrmk, 0 }, - { X86::VPMOVSXWQZrrk, X86::VPMOVSXWQZrmk, 0 }, - { X86::VPMOVZXBDZrrk, X86::VPMOVZXBDZrmk, 0 }, - { X86::VPMOVZXBQZrrk, X86::VPMOVZXBQZrmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZrrk, X86::VPMOVZXBWZrmk, 0 }, - { X86::VPMOVZXDQZrrk, X86::VPMOVZXDQZrmk, 0 }, - { X86::VPMOVZXWDZrrk, X86::VPMOVZXWDZrmk, 0 }, - { X86::VPMOVZXWQZrrk, X86::VPMOVZXWQZrmk, 0 }, - { X86::VPSHUFDZrik, X86::VPSHUFDZmik, 0 }, - { X86::VPSHUFHWZrik, X86::VPSHUFHWZmik, 0 }, - { X86::VPSHUFLWZrik, X86::VPSHUFLWZmik, 0 }, - { X86::VPSLLDZrik, X86::VPSLLDZmik, 0 }, - { X86::VPSLLQZrik, X86::VPSLLQZmik, 0 }, - { X86::VPSLLWZrik, X86::VPSLLWZmik, 0 }, - { X86::VPSRADZrik, X86::VPSRADZmik, 0 }, - { X86::VPSRAQZrik, X86::VPSRAQZmik, 0 }, - { X86::VPSRAWZrik, X86::VPSRAWZmik, 0 }, - { X86::VPSRLDZrik, X86::VPSRLDZmik, 0 }, - { X86::VPSRLQZrik, X86::VPSRLQZmik, 0 }, - { X86::VPSRLWZrik, X86::VPSRLWZmik, 0 }, - - // AVX-512VL 256-bit masked foldable instructions - { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE }, - { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE }, - { X86::VPABSBZ256rrk, X86::VPABSBZ256rmk, 0 }, - { X86::VPABSDZ256rrk, X86::VPABSDZ256rmk, 0 }, - { X86::VPABSQZ256rrk, X86::VPABSQZ256rmk, 0 }, - { X86::VPABSWZ256rrk, X86::VPABSWZ256rmk, 0 }, - { X86::VPERMILPDZ256rik, X86::VPERMILPDZ256mik, 0 }, - { X86::VPERMILPSZ256rik, X86::VPERMILPSZ256mik, 0 }, - { X86::VPERMPDZ256rik, X86::VPERMPDZ256mik, 0 }, - { X86::VPERMQZ256rik, X86::VPERMQZ256mik, 0 }, - { X86::VPMOVSXBDZ256rrk, X86::VPMOVSXBDZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ256rrk, X86::VPMOVSXBQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ256rrk, X86::VPMOVSXBWZ256rmk, 0 }, - { X86::VPMOVSXDQZ256rrk, X86::VPMOVSXDQZ256rmk, 0 }, - { X86::VPMOVSXWDZ256rrk, X86::VPMOVSXWDZ256rmk, 0 }, - { X86::VPMOVSXWQZ256rrk, X86::VPMOVSXWQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ256rrk, 
X86::VPMOVZXBDZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ256rrk, X86::VPMOVZXBQZ256rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ256rrk, X86::VPMOVZXBWZ256rmk, 0 }, - { X86::VPMOVZXDQZ256rrk, X86::VPMOVZXDQZ256rmk, 0 }, - { X86::VPMOVZXWDZ256rrk, X86::VPMOVZXWDZ256rmk, 0 }, - { X86::VPMOVZXWQZ256rrk, X86::VPMOVZXWQZ256rmk, TB_NO_REVERSE }, - { X86::VPSHUFDZ256rik, X86::VPSHUFDZ256mik, 0 }, - { X86::VPSHUFHWZ256rik, X86::VPSHUFHWZ256mik, 0 }, - { X86::VPSHUFLWZ256rik, X86::VPSHUFLWZ256mik, 0 }, - { X86::VPSLLDZ256rik, X86::VPSLLDZ256mik, 0 }, - { X86::VPSLLQZ256rik, X86::VPSLLQZ256mik, 0 }, - { X86::VPSLLWZ256rik, X86::VPSLLWZ256mik, 0 }, - { X86::VPSRADZ256rik, X86::VPSRADZ256mik, 0 }, - { X86::VPSRAQZ256rik, X86::VPSRAQZ256mik, 0 }, - { X86::VPSRAWZ256rik, X86::VPSRAWZ256mik, 0 }, - { X86::VPSRLDZ256rik, X86::VPSRLDZ256mik, 0 }, - { X86::VPSRLQZ256rik, X86::VPSRLQZ256mik, 0 }, - { X86::VPSRLWZ256rik, X86::VPSRLWZ256mik, 0 }, - - // AVX-512VL 128-bit masked foldable instructions - { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE }, - { X86::VPABSBZ128rrk, X86::VPABSBZ128rmk, 0 }, - { X86::VPABSDZ128rrk, X86::VPABSDZ128rmk, 0 }, - { X86::VPABSQZ128rrk, X86::VPABSQZ128rmk, 0 }, - { X86::VPABSWZ128rrk, X86::VPABSWZ128rmk, 0 }, - { X86::VPERMILPDZ128rik, X86::VPERMILPDZ128mik, 0 }, - { X86::VPERMILPSZ128rik, X86::VPERMILPSZ128mik, 0 }, - { X86::VPMOVSXBDZ128rrk, X86::VPMOVSXBDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBQZ128rrk, X86::VPMOVSXBQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXBWZ128rrk, X86::VPMOVSXBWZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXDQZ128rrk, X86::VPMOVSXDQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXWDZ128rrk, X86::VPMOVSXWDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVSXWQZ128rrk, X86::VPMOVSXWQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBDZ128rrk, X86::VPMOVZXBDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBQZ128rrk, X86::VPMOVZXBQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXBWZ128rrk, X86::VPMOVZXBWZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXDQZ128rrk, X86::VPMOVZXDQZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXWDZ128rrk, X86::VPMOVZXWDZ128rmk, TB_NO_REVERSE }, - { X86::VPMOVZXWQZ128rrk, X86::VPMOVZXWQZ128rmk, TB_NO_REVERSE }, - { X86::VPSHUFDZ128rik, X86::VPSHUFDZ128mik, 0 }, - { X86::VPSHUFHWZ128rik, X86::VPSHUFHWZ128mik, 0 }, - { X86::VPSHUFLWZ128rik, X86::VPSHUFLWZ128mik, 0 }, - { X86::VPSLLDZ128rik, X86::VPSLLDZ128mik, 0 }, - { X86::VPSLLQZ128rik, X86::VPSLLQZ128mik, 0 }, - { X86::VPSLLWZ128rik, X86::VPSLLWZ128mik, 0 }, - { X86::VPSRADZ128rik, X86::VPSRADZ128mik, 0 }, - { X86::VPSRAQZ128rik, X86::VPSRAQZ128mik, 0 }, - { X86::VPSRAWZ128rik, X86::VPSRAWZ128mik, 0 }, - { X86::VPSRLDZ128rik, X86::VPSRLDZ128mik, 0 }, - { X86::VPSRLQZ128rik, X86::VPSRLQZ128mik, 0 }, - { X86::VPSRLWZ128rik, X86::VPSRLWZ128mik, 0 }, - }; - for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) { AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, Entry.RegOp, Entry.MemOp, // Index 3, folded load Entry.Flags | TB_INDEX_3 | TB_FOLDED_LOAD); } - auto I = X86InstrFMA3Info::rm_begin(); - auto E = X86InstrFMA3Info::rm_end(); - for (; I != E; ++I) { - if (!I.getGroup()->isKMasked()) { - // Intrinsic forms need to pass TB_NO_REVERSE. 
- if (I.getGroup()->isIntrinsic()) { - AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD | TB_NO_REVERSE); - } else { - AddTableEntry(RegOp2MemOpTable3, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_3 | TB_FOLDED_LOAD); - } - } - } - - static const X86MemoryFoldTableEntry MemoryFoldTable4[] = { - // AVX-512 foldable masked instructions - { X86::VADDPDZrrk, X86::VADDPDZrmk, 0 }, - { X86::VADDPSZrrk, X86::VADDPSZrmk, 0 }, - { X86::VADDSDZrr_Intk, X86::VADDSDZrm_Intk, TB_NO_REVERSE }, - { X86::VADDSSZrr_Intk, X86::VADDSSZrm_Intk, TB_NO_REVERSE }, - { X86::VALIGNDZrrik, X86::VALIGNDZrmik, 0 }, - { X86::VALIGNQZrrik, X86::VALIGNQZrmik, 0 }, - { X86::VANDNPDZrrk, X86::VANDNPDZrmk, 0 }, - { X86::VANDNPSZrrk, X86::VANDNPSZrmk, 0 }, - { X86::VANDPDZrrk, X86::VANDPDZrmk, 0 }, - { X86::VANDPSZrrk, X86::VANDPSZrmk, 0 }, - { X86::VDIVPDZrrk, X86::VDIVPDZrmk, 0 }, - { X86::VDIVPSZrrk, X86::VDIVPSZrmk, 0 }, - { X86::VDIVSDZrr_Intk, X86::VDIVSDZrm_Intk, TB_NO_REVERSE }, - { X86::VDIVSSZrr_Intk, X86::VDIVSSZrm_Intk, TB_NO_REVERSE }, - { X86::VINSERTF32x4Zrrk, X86::VINSERTF32x4Zrmk, 0 }, - { X86::VINSERTF32x8Zrrk, X86::VINSERTF32x8Zrmk, 0 }, - { X86::VINSERTF64x2Zrrk, X86::VINSERTF64x2Zrmk, 0 }, - { X86::VINSERTF64x4Zrrk, X86::VINSERTF64x4Zrmk, 0 }, - { X86::VINSERTI32x4Zrrk, X86::VINSERTI32x4Zrmk, 0 }, - { X86::VINSERTI32x8Zrrk, X86::VINSERTI32x8Zrmk, 0 }, - { X86::VINSERTI64x2Zrrk, X86::VINSERTI64x2Zrmk, 0 }, - { X86::VINSERTI64x4Zrrk, X86::VINSERTI64x4Zrmk, 0 }, - { X86::VMAXCPDZrrk, X86::VMAXCPDZrmk, 0 }, - { X86::VMAXCPSZrrk, X86::VMAXCPSZrmk, 0 }, - { X86::VMAXPDZrrk, X86::VMAXPDZrmk, 0 }, - { X86::VMAXPSZrrk, X86::VMAXPSZrmk, 0 }, - { X86::VMAXSDZrr_Intk, X86::VMAXSDZrm_Intk, 0 }, - { X86::VMAXSSZrr_Intk, X86::VMAXSSZrm_Intk, 0 }, - { X86::VMINCPDZrrk, X86::VMINCPDZrmk, 0 }, - { X86::VMINCPSZrrk, X86::VMINCPSZrmk, 0 }, - { X86::VMINPDZrrk, X86::VMINPDZrmk, 0 }, - { X86::VMINPSZrrk, X86::VMINPSZrmk, 0 }, - { X86::VMINSDZrr_Intk, X86::VMINSDZrm_Intk, 0 }, - { X86::VMINSSZrr_Intk, X86::VMINSSZrm_Intk, 0 }, - { X86::VMULPDZrrk, X86::VMULPDZrmk, 0 }, - { X86::VMULPSZrrk, X86::VMULPSZrmk, 0 }, - { X86::VMULSDZrr_Intk, X86::VMULSDZrm_Intk, TB_NO_REVERSE }, - { X86::VMULSSZrr_Intk, X86::VMULSSZrm_Intk, TB_NO_REVERSE }, - { X86::VORPDZrrk, X86::VORPDZrmk, 0 }, - { X86::VORPSZrrk, X86::VORPSZrmk, 0 }, - { X86::VPACKSSDWZrrk, X86::VPACKSSDWZrmk, 0 }, - { X86::VPACKSSWBZrrk, X86::VPACKSSWBZrmk, 0 }, - { X86::VPACKUSDWZrrk, X86::VPACKUSDWZrmk, 0 }, - { X86::VPACKUSWBZrrk, X86::VPACKUSWBZrmk, 0 }, - { X86::VPADDBZrrk, X86::VPADDBZrmk, 0 }, - { X86::VPADDDZrrk, X86::VPADDDZrmk, 0 }, - { X86::VPADDQZrrk, X86::VPADDQZrmk, 0 }, - { X86::VPADDSBZrrk, X86::VPADDSBZrmk, 0 }, - { X86::VPADDSWZrrk, X86::VPADDSWZrmk, 0 }, - { X86::VPADDUSBZrrk, X86::VPADDUSBZrmk, 0 }, - { X86::VPADDUSWZrrk, X86::VPADDUSWZrmk, 0 }, - { X86::VPADDWZrrk, X86::VPADDWZrmk, 0 }, - { X86::VPALIGNRZrrik, X86::VPALIGNRZrmik, 0 }, - { X86::VPANDDZrrk, X86::VPANDDZrmk, 0 }, - { X86::VPANDNDZrrk, X86::VPANDNDZrmk, 0 }, - { X86::VPANDNQZrrk, X86::VPANDNQZrmk, 0 }, - { X86::VPANDQZrrk, X86::VPANDQZrmk, 0 }, - { X86::VPAVGBZrrk, X86::VPAVGBZrmk, 0 }, - { X86::VPAVGWZrrk, X86::VPAVGWZrmk, 0 }, - { X86::VPERMBZrrk, X86::VPERMBZrmk, 0 }, - { X86::VPERMDZrrk, X86::VPERMDZrmk, 0 }, - { X86::VPERMI2Brrk, X86::VPERMI2Brmk, 0 }, - { X86::VPERMI2Drrk, X86::VPERMI2Drmk, 0 }, - { X86::VPERMI2PSrrk, X86::VPERMI2PSrmk, 0 }, - { X86::VPERMI2PDrrk, 
X86::VPERMI2PDrmk, 0 }, - { X86::VPERMI2Qrrk, X86::VPERMI2Qrmk, 0 }, - { X86::VPERMI2Wrrk, X86::VPERMI2Wrmk, 0 }, - { X86::VPERMILPDZrrk, X86::VPERMILPDZrmk, 0 }, - { X86::VPERMILPSZrrk, X86::VPERMILPSZrmk, 0 }, - { X86::VPERMPDZrrk, X86::VPERMPDZrmk, 0 }, - { X86::VPERMPSZrrk, X86::VPERMPSZrmk, 0 }, - { X86::VPERMQZrrk, X86::VPERMQZrmk, 0 }, - { X86::VPERMT2Brrk, X86::VPERMT2Brmk, 0 }, - { X86::VPERMT2Drrk, X86::VPERMT2Drmk, 0 }, - { X86::VPERMT2PSrrk, X86::VPERMT2PSrmk, 0 }, - { X86::VPERMT2PDrrk, X86::VPERMT2PDrmk, 0 }, - { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, - { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, - { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, - { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, - { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, - { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 }, - { X86::VPMAXSDZrrk, X86::VPMAXSDZrmk, 0 }, - { X86::VPMAXSQZrrk, X86::VPMAXSQZrmk, 0 }, - { X86::VPMAXSWZrrk, X86::VPMAXSWZrmk, 0 }, - { X86::VPMAXUBZrrk, X86::VPMAXUBZrmk, 0 }, - { X86::VPMAXUDZrrk, X86::VPMAXUDZrmk, 0 }, - { X86::VPMAXUQZrrk, X86::VPMAXUQZrmk, 0 }, - { X86::VPMAXUWZrrk, X86::VPMAXUWZrmk, 0 }, - { X86::VPMINSBZrrk, X86::VPMINSBZrmk, 0 }, - { X86::VPMINSDZrrk, X86::VPMINSDZrmk, 0 }, - { X86::VPMINSQZrrk, X86::VPMINSQZrmk, 0 }, - { X86::VPMINSWZrrk, X86::VPMINSWZrmk, 0 }, - { X86::VPMINUBZrrk, X86::VPMINUBZrmk, 0 }, - { X86::VPMINUDZrrk, X86::VPMINUDZrmk, 0 }, - { X86::VPMINUQZrrk, X86::VPMINUQZrmk, 0 }, - { X86::VPMINUWZrrk, X86::VPMINUWZrmk, 0 }, - { X86::VPMULDQZrrk, X86::VPMULDQZrmk, 0 }, - { X86::VPMULLDZrrk, X86::VPMULLDZrmk, 0 }, - { X86::VPMULLQZrrk, X86::VPMULLQZrmk, 0 }, - { X86::VPMULLWZrrk, X86::VPMULLWZrmk, 0 }, - { X86::VPMULUDQZrrk, X86::VPMULUDQZrmk, 0 }, - { X86::VPORDZrrk, X86::VPORDZrmk, 0 }, - { X86::VPORQZrrk, X86::VPORQZrmk, 0 }, - { X86::VPSHUFBZrrk, X86::VPSHUFBZrmk, 0 }, - { X86::VPSLLDZrrk, X86::VPSLLDZrmk, 0 }, - { X86::VPSLLQZrrk, X86::VPSLLQZrmk, 0 }, - { X86::VPSLLVDZrrk, X86::VPSLLVDZrmk, 0 }, - { X86::VPSLLVQZrrk, X86::VPSLLVQZrmk, 0 }, - { X86::VPSLLVWZrrk, X86::VPSLLVWZrmk, 0 }, - { X86::VPSLLWZrrk, X86::VPSLLWZrmk, 0 }, - { X86::VPSRADZrrk, X86::VPSRADZrmk, 0 }, - { X86::VPSRAQZrrk, X86::VPSRAQZrmk, 0 }, - { X86::VPSRAVDZrrk, X86::VPSRAVDZrmk, 0 }, - { X86::VPSRAVQZrrk, X86::VPSRAVQZrmk, 0 }, - { X86::VPSRAVWZrrk, X86::VPSRAVWZrmk, 0 }, - { X86::VPSRAWZrrk, X86::VPSRAWZrmk, 0 }, - { X86::VPSRLDZrrk, X86::VPSRLDZrmk, 0 }, - { X86::VPSRLQZrrk, X86::VPSRLQZrmk, 0 }, - { X86::VPSRLVDZrrk, X86::VPSRLVDZrmk, 0 }, - { X86::VPSRLVQZrrk, X86::VPSRLVQZrmk, 0 }, - { X86::VPSRLVWZrrk, X86::VPSRLVWZrmk, 0 }, - { X86::VPSRLWZrrk, X86::VPSRLWZrmk, 0 }, - { X86::VPSUBBZrrk, X86::VPSUBBZrmk, 0 }, - { X86::VPSUBDZrrk, X86::VPSUBDZrmk, 0 }, - { X86::VPSUBQZrrk, X86::VPSUBQZrmk, 0 }, - { X86::VPSUBSBZrrk, X86::VPSUBSBZrmk, 0 }, - { X86::VPSUBSWZrrk, X86::VPSUBSWZrmk, 0 }, - { X86::VPSUBUSBZrrk, X86::VPSUBUSBZrmk, 0 }, - { X86::VPSUBUSWZrrk, X86::VPSUBUSWZrmk, 0 }, - { X86::VPTERNLOGDZrrik, X86::VPTERNLOGDZrmik, 0 }, - { X86::VPTERNLOGQZrrik, X86::VPTERNLOGQZrmik, 0 }, - { X86::VPUNPCKHBWZrrk, X86::VPUNPCKHBWZrmk, 0 }, - { X86::VPUNPCKHDQZrrk, X86::VPUNPCKHDQZrmk, 0 }, - { X86::VPUNPCKHQDQZrrk, X86::VPUNPCKHQDQZrmk, 0 }, - { X86::VPUNPCKHWDZrrk, X86::VPUNPCKHWDZrmk, 0 }, - { X86::VPUNPCKLBWZrrk, X86::VPUNPCKLBWZrmk, 0 }, - { X86::VPUNPCKLDQZrrk, X86::VPUNPCKLDQZrmk, 0 }, - { X86::VPUNPCKLQDQZrrk, X86::VPUNPCKLQDQZrmk, 0 }, - { X86::VPUNPCKLWDZrrk, X86::VPUNPCKLWDZrmk, 0 }, - { X86::VPXORDZrrk, X86::VPXORDZrmk, 0 }, - { X86::VPXORQZrrk, X86::VPXORQZrmk, 0 }, - { 
X86::VSHUFPDZrrik, X86::VSHUFPDZrmik, 0 }, - { X86::VSHUFPSZrrik, X86::VSHUFPSZrmik, 0 }, - { X86::VSUBPDZrrk, X86::VSUBPDZrmk, 0 }, - { X86::VSUBPSZrrk, X86::VSUBPSZrmk, 0 }, - { X86::VSUBSDZrr_Intk, X86::VSUBSDZrm_Intk, TB_NO_REVERSE }, - { X86::VSUBSSZrr_Intk, X86::VSUBSSZrm_Intk, TB_NO_REVERSE }, - { X86::VUNPCKHPDZrrk, X86::VUNPCKHPDZrmk, 0 }, - { X86::VUNPCKHPSZrrk, X86::VUNPCKHPSZrmk, 0 }, - { X86::VUNPCKLPDZrrk, X86::VUNPCKLPDZrmk, 0 }, - { X86::VUNPCKLPSZrrk, X86::VUNPCKLPSZrmk, 0 }, - { X86::VXORPDZrrk, X86::VXORPDZrmk, 0 }, - { X86::VXORPSZrrk, X86::VXORPSZrmk, 0 }, - - // AVX-512{F,VL} foldable masked instructions 256-bit - { X86::VADDPDZ256rrk, X86::VADDPDZ256rmk, 0 }, - { X86::VADDPSZ256rrk, X86::VADDPSZ256rmk, 0 }, - { X86::VALIGNDZ256rrik, X86::VALIGNDZ256rmik, 0 }, - { X86::VALIGNQZ256rrik, X86::VALIGNQZ256rmik, 0 }, - { X86::VANDNPDZ256rrk, X86::VANDNPDZ256rmk, 0 }, - { X86::VANDNPSZ256rrk, X86::VANDNPSZ256rmk, 0 }, - { X86::VANDPDZ256rrk, X86::VANDPDZ256rmk, 0 }, - { X86::VANDPSZ256rrk, X86::VANDPSZ256rmk, 0 }, - { X86::VDIVPDZ256rrk, X86::VDIVPDZ256rmk, 0 }, - { X86::VDIVPSZ256rrk, X86::VDIVPSZ256rmk, 0 }, - { X86::VINSERTF32x4Z256rrk,X86::VINSERTF32x4Z256rmk, 0 }, - { X86::VINSERTF64x2Z256rrk,X86::VINSERTF64x2Z256rmk, 0 }, - { X86::VINSERTI32x4Z256rrk,X86::VINSERTI32x4Z256rmk, 0 }, - { X86::VINSERTI64x2Z256rrk,X86::VINSERTI64x2Z256rmk, 0 }, - { X86::VMAXCPDZ256rrk, X86::VMAXCPDZ256rmk, 0 }, - { X86::VMAXCPSZ256rrk, X86::VMAXCPSZ256rmk, 0 }, - { X86::VMAXPDZ256rrk, X86::VMAXPDZ256rmk, 0 }, - { X86::VMAXPSZ256rrk, X86::VMAXPSZ256rmk, 0 }, - { X86::VMINCPDZ256rrk, X86::VMINCPDZ256rmk, 0 }, - { X86::VMINCPSZ256rrk, X86::VMINCPSZ256rmk, 0 }, - { X86::VMINPDZ256rrk, X86::VMINPDZ256rmk, 0 }, - { X86::VMINPSZ256rrk, X86::VMINPSZ256rmk, 0 }, - { X86::VMULPDZ256rrk, X86::VMULPDZ256rmk, 0 }, - { X86::VMULPSZ256rrk, X86::VMULPSZ256rmk, 0 }, - { X86::VORPDZ256rrk, X86::VORPDZ256rmk, 0 }, - { X86::VORPSZ256rrk, X86::VORPSZ256rmk, 0 }, - { X86::VPACKSSDWZ256rrk, X86::VPACKSSDWZ256rmk, 0 }, - { X86::VPACKSSWBZ256rrk, X86::VPACKSSWBZ256rmk, 0 }, - { X86::VPACKUSDWZ256rrk, X86::VPACKUSDWZ256rmk, 0 }, - { X86::VPACKUSWBZ256rrk, X86::VPACKUSWBZ256rmk, 0 }, - { X86::VPADDBZ256rrk, X86::VPADDBZ256rmk, 0 }, - { X86::VPADDDZ256rrk, X86::VPADDDZ256rmk, 0 }, - { X86::VPADDQZ256rrk, X86::VPADDQZ256rmk, 0 }, - { X86::VPADDSBZ256rrk, X86::VPADDSBZ256rmk, 0 }, - { X86::VPADDSWZ256rrk, X86::VPADDSWZ256rmk, 0 }, - { X86::VPADDUSBZ256rrk, X86::VPADDUSBZ256rmk, 0 }, - { X86::VPADDUSWZ256rrk, X86::VPADDUSWZ256rmk, 0 }, - { X86::VPADDWZ256rrk, X86::VPADDWZ256rmk, 0 }, - { X86::VPALIGNRZ256rrik, X86::VPALIGNRZ256rmik, 0 }, - { X86::VPANDDZ256rrk, X86::VPANDDZ256rmk, 0 }, - { X86::VPANDNDZ256rrk, X86::VPANDNDZ256rmk, 0 }, - { X86::VPANDNQZ256rrk, X86::VPANDNQZ256rmk, 0 }, - { X86::VPANDQZ256rrk, X86::VPANDQZ256rmk, 0 }, - { X86::VPAVGBZ256rrk, X86::VPAVGBZ256rmk, 0 }, - { X86::VPAVGWZ256rrk, X86::VPAVGWZ256rmk, 0 }, - { X86::VPERMBZ256rrk, X86::VPERMBZ256rmk, 0 }, - { X86::VPERMDZ256rrk, X86::VPERMDZ256rmk, 0 }, - { X86::VPERMI2B256rrk, X86::VPERMI2B256rmk, 0 }, - { X86::VPERMI2D256rrk, X86::VPERMI2D256rmk, 0 }, - { X86::VPERMI2PD256rrk, X86::VPERMI2PD256rmk, 0 }, - { X86::VPERMI2PS256rrk, X86::VPERMI2PS256rmk, 0 }, - { X86::VPERMI2Q256rrk, X86::VPERMI2Q256rmk, 0 }, - { X86::VPERMI2W256rrk, X86::VPERMI2W256rmk, 0 }, - { X86::VPERMILPDZ256rrk, X86::VPERMILPDZ256rmk, 0 }, - { X86::VPERMILPSZ256rrk, X86::VPERMILPSZ256rmk, 0 }, - { X86::VPERMPDZ256rrk, X86::VPERMPDZ256rmk, 0 }, - { X86::VPERMPSZ256rrk, 
X86::VPERMPSZ256rmk, 0 }, - { X86::VPERMQZ256rrk, X86::VPERMQZ256rmk, 0 }, - { X86::VPERMT2B256rrk, X86::VPERMT2B256rmk, 0 }, - { X86::VPERMT2D256rrk, X86::VPERMT2D256rmk, 0 }, - { X86::VPERMT2PD256rrk, X86::VPERMT2PD256rmk, 0 }, - { X86::VPERMT2PS256rrk, X86::VPERMT2PS256rmk, 0 }, - { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, - { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, - { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, - { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, - { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, - { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 }, - { X86::VPMAXSDZ256rrk, X86::VPMAXSDZ256rmk, 0 }, - { X86::VPMAXSQZ256rrk, X86::VPMAXSQZ256rmk, 0 }, - { X86::VPMAXSWZ256rrk, X86::VPMAXSWZ256rmk, 0 }, - { X86::VPMAXUBZ256rrk, X86::VPMAXUBZ256rmk, 0 }, - { X86::VPMAXUDZ256rrk, X86::VPMAXUDZ256rmk, 0 }, - { X86::VPMAXUQZ256rrk, X86::VPMAXUQZ256rmk, 0 }, - { X86::VPMAXUWZ256rrk, X86::VPMAXUWZ256rmk, 0 }, - { X86::VPMINSBZ256rrk, X86::VPMINSBZ256rmk, 0 }, - { X86::VPMINSDZ256rrk, X86::VPMINSDZ256rmk, 0 }, - { X86::VPMINSQZ256rrk, X86::VPMINSQZ256rmk, 0 }, - { X86::VPMINSWZ256rrk, X86::VPMINSWZ256rmk, 0 }, - { X86::VPMINUBZ256rrk, X86::VPMINUBZ256rmk, 0 }, - { X86::VPMINUDZ256rrk, X86::VPMINUDZ256rmk, 0 }, - { X86::VPMINUQZ256rrk, X86::VPMINUQZ256rmk, 0 }, - { X86::VPMINUWZ256rrk, X86::VPMINUWZ256rmk, 0 }, - { X86::VPMULDQZ256rrk, X86::VPMULDQZ256rmk, 0 }, - { X86::VPMULLDZ256rrk, X86::VPMULLDZ256rmk, 0 }, - { X86::VPMULLQZ256rrk, X86::VPMULLQZ256rmk, 0 }, - { X86::VPMULLWZ256rrk, X86::VPMULLWZ256rmk, 0 }, - { X86::VPMULUDQZ256rrk, X86::VPMULUDQZ256rmk, 0 }, - { X86::VPORDZ256rrk, X86::VPORDZ256rmk, 0 }, - { X86::VPORQZ256rrk, X86::VPORQZ256rmk, 0 }, - { X86::VPSHUFBZ256rrk, X86::VPSHUFBZ256rmk, 0 }, - { X86::VPSLLDZ256rrk, X86::VPSLLDZ256rmk, 0 }, - { X86::VPSLLQZ256rrk, X86::VPSLLQZ256rmk, 0 }, - { X86::VPSLLVDZ256rrk, X86::VPSLLVDZ256rmk, 0 }, - { X86::VPSLLVQZ256rrk, X86::VPSLLVQZ256rmk, 0 }, - { X86::VPSLLVWZ256rrk, X86::VPSLLVWZ256rmk, 0 }, - { X86::VPSLLWZ256rrk, X86::VPSLLWZ256rmk, 0 }, - { X86::VPSRADZ256rrk, X86::VPSRADZ256rmk, 0 }, - { X86::VPSRAQZ256rrk, X86::VPSRAQZ256rmk, 0 }, - { X86::VPSRAVDZ256rrk, X86::VPSRAVDZ256rmk, 0 }, - { X86::VPSRAVQZ256rrk, X86::VPSRAVQZ256rmk, 0 }, - { X86::VPSRAVWZ256rrk, X86::VPSRAVWZ256rmk, 0 }, - { X86::VPSRAWZ256rrk, X86::VPSRAWZ256rmk, 0 }, - { X86::VPSRLDZ256rrk, X86::VPSRLDZ256rmk, 0 }, - { X86::VPSRLQZ256rrk, X86::VPSRLQZ256rmk, 0 }, - { X86::VPSRLVDZ256rrk, X86::VPSRLVDZ256rmk, 0 }, - { X86::VPSRLVQZ256rrk, X86::VPSRLVQZ256rmk, 0 }, - { X86::VPSRLVWZ256rrk, X86::VPSRLVWZ256rmk, 0 }, - { X86::VPSRLWZ256rrk, X86::VPSRLWZ256rmk, 0 }, - { X86::VPSUBBZ256rrk, X86::VPSUBBZ256rmk, 0 }, - { X86::VPSUBDZ256rrk, X86::VPSUBDZ256rmk, 0 }, - { X86::VPSUBQZ256rrk, X86::VPSUBQZ256rmk, 0 }, - { X86::VPSUBSBZ256rrk, X86::VPSUBSBZ256rmk, 0 }, - { X86::VPSUBSWZ256rrk, X86::VPSUBSWZ256rmk, 0 }, - { X86::VPSUBUSBZ256rrk, X86::VPSUBUSBZ256rmk, 0 }, - { X86::VPSUBUSWZ256rrk, X86::VPSUBUSWZ256rmk, 0 }, - { X86::VPSUBWZ256rrk, X86::VPSUBWZ256rmk, 0 }, - { X86::VPTERNLOGDZ256rrik, X86::VPTERNLOGDZ256rmik, 0 }, - { X86::VPTERNLOGQZ256rrik, X86::VPTERNLOGQZ256rmik, 0 }, - { X86::VPUNPCKHBWZ256rrk, X86::VPUNPCKHBWZ256rmk, 0 }, - { X86::VPUNPCKHDQZ256rrk, X86::VPUNPCKHDQZ256rmk, 0 }, - { X86::VPUNPCKHQDQZ256rrk, X86::VPUNPCKHQDQZ256rmk, 0 }, - { X86::VPUNPCKHWDZ256rrk, X86::VPUNPCKHWDZ256rmk, 0 }, - { X86::VPUNPCKLBWZ256rrk, X86::VPUNPCKLBWZ256rmk, 0 }, - { X86::VPUNPCKLDQZ256rrk, X86::VPUNPCKLDQZ256rmk, 0 }, - { 
X86::VPUNPCKLQDQZ256rrk, X86::VPUNPCKLQDQZ256rmk, 0 }, - { X86::VPUNPCKLWDZ256rrk, X86::VPUNPCKLWDZ256rmk, 0 }, - { X86::VPXORDZ256rrk, X86::VPXORDZ256rmk, 0 }, - { X86::VPXORQZ256rrk, X86::VPXORQZ256rmk, 0 }, - { X86::VSHUFPDZ256rrik, X86::VSHUFPDZ256rmik, 0 }, - { X86::VSHUFPSZ256rrik, X86::VSHUFPSZ256rmik, 0 }, - { X86::VSUBPDZ256rrk, X86::VSUBPDZ256rmk, 0 }, - { X86::VSUBPSZ256rrk, X86::VSUBPSZ256rmk, 0 }, - { X86::VUNPCKHPDZ256rrk, X86::VUNPCKHPDZ256rmk, 0 }, - { X86::VUNPCKHPSZ256rrk, X86::VUNPCKHPSZ256rmk, 0 }, - { X86::VUNPCKLPDZ256rrk, X86::VUNPCKLPDZ256rmk, 0 }, - { X86::VUNPCKLPSZ256rrk, X86::VUNPCKLPSZ256rmk, 0 }, - { X86::VXORPDZ256rrk, X86::VXORPDZ256rmk, 0 }, - { X86::VXORPSZ256rrk, X86::VXORPSZ256rmk, 0 }, - - // AVX-512{F,VL} foldable instructions 128-bit - { X86::VADDPDZ128rrk, X86::VADDPDZ128rmk, 0 }, - { X86::VADDPSZ128rrk, X86::VADDPSZ128rmk, 0 }, - { X86::VALIGNDZ128rrik, X86::VALIGNDZ128rmik, 0 }, - { X86::VALIGNQZ128rrik, X86::VALIGNQZ128rmik, 0 }, - { X86::VANDNPDZ128rrk, X86::VANDNPDZ128rmk, 0 }, - { X86::VANDNPSZ128rrk, X86::VANDNPSZ128rmk, 0 }, - { X86::VANDPDZ128rrk, X86::VANDPDZ128rmk, 0 }, - { X86::VANDPSZ128rrk, X86::VANDPSZ128rmk, 0 }, - { X86::VDIVPDZ128rrk, X86::VDIVPDZ128rmk, 0 }, - { X86::VDIVPSZ128rrk, X86::VDIVPSZ128rmk, 0 }, - { X86::VMAXCPDZ128rrk, X86::VMAXCPDZ128rmk, 0 }, - { X86::VMAXCPSZ128rrk, X86::VMAXCPSZ128rmk, 0 }, - { X86::VMAXPDZ128rrk, X86::VMAXPDZ128rmk, 0 }, - { X86::VMAXPSZ128rrk, X86::VMAXPSZ128rmk, 0 }, - { X86::VMINCPDZ128rrk, X86::VMINCPDZ128rmk, 0 }, - { X86::VMINCPSZ128rrk, X86::VMINCPSZ128rmk, 0 }, - { X86::VMINPDZ128rrk, X86::VMINPDZ128rmk, 0 }, - { X86::VMINPSZ128rrk, X86::VMINPSZ128rmk, 0 }, - { X86::VMULPDZ128rrk, X86::VMULPDZ128rmk, 0 }, - { X86::VMULPSZ128rrk, X86::VMULPSZ128rmk, 0 }, - { X86::VORPDZ128rrk, X86::VORPDZ128rmk, 0 }, - { X86::VORPSZ128rrk, X86::VORPSZ128rmk, 0 }, - { X86::VPACKSSDWZ128rrk, X86::VPACKSSDWZ128rmk, 0 }, - { X86::VPACKSSWBZ128rrk, X86::VPACKSSWBZ128rmk, 0 }, - { X86::VPACKUSDWZ128rrk, X86::VPACKUSDWZ128rmk, 0 }, - { X86::VPACKUSWBZ128rrk, X86::VPACKUSWBZ128rmk, 0 }, - { X86::VPADDBZ128rrk, X86::VPADDBZ128rmk, 0 }, - { X86::VPADDDZ128rrk, X86::VPADDDZ128rmk, 0 }, - { X86::VPADDQZ128rrk, X86::VPADDQZ128rmk, 0 }, - { X86::VPADDSBZ128rrk, X86::VPADDSBZ128rmk, 0 }, - { X86::VPADDSWZ128rrk, X86::VPADDSWZ128rmk, 0 }, - { X86::VPADDUSBZ128rrk, X86::VPADDUSBZ128rmk, 0 }, - { X86::VPADDUSWZ128rrk, X86::VPADDUSWZ128rmk, 0 }, - { X86::VPADDWZ128rrk, X86::VPADDWZ128rmk, 0 }, - { X86::VPALIGNRZ128rrik, X86::VPALIGNRZ128rmik, 0 }, - { X86::VPANDDZ128rrk, X86::VPANDDZ128rmk, 0 }, - { X86::VPANDNDZ128rrk, X86::VPANDNDZ128rmk, 0 }, - { X86::VPANDNQZ128rrk, X86::VPANDNQZ128rmk, 0 }, - { X86::VPANDQZ128rrk, X86::VPANDQZ128rmk, 0 }, - { X86::VPAVGBZ128rrk, X86::VPAVGBZ128rmk, 0 }, - { X86::VPAVGWZ128rrk, X86::VPAVGWZ128rmk, 0 }, - { X86::VPERMBZ128rrk, X86::VPERMBZ128rmk, 0 }, - { X86::VPERMI2B128rrk, X86::VPERMI2B128rmk, 0 }, - { X86::VPERMI2D128rrk, X86::VPERMI2D128rmk, 0 }, - { X86::VPERMI2PD128rrk, X86::VPERMI2PD128rmk, 0 }, - { X86::VPERMI2PS128rrk, X86::VPERMI2PS128rmk, 0 }, - { X86::VPERMI2Q128rrk, X86::VPERMI2Q128rmk, 0 }, - { X86::VPERMI2W128rrk, X86::VPERMI2W128rmk, 0 }, - { X86::VPERMILPDZ128rrk, X86::VPERMILPDZ128rmk, 0 }, - { X86::VPERMILPSZ128rrk, X86::VPERMILPSZ128rmk, 0 }, - { X86::VPERMT2B128rrk, X86::VPERMT2B128rmk, 0 }, - { X86::VPERMT2D128rrk, X86::VPERMT2D128rmk, 0 }, - { X86::VPERMT2PD128rrk, X86::VPERMT2PD128rmk, 0 }, - { X86::VPERMT2PS128rrk, X86::VPERMT2PS128rmk, 0 }, - { 
X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, - { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, - { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, - { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, - { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, - { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 }, - { X86::VPMAXSDZ128rrk, X86::VPMAXSDZ128rmk, 0 }, - { X86::VPMAXSQZ128rrk, X86::VPMAXSQZ128rmk, 0 }, - { X86::VPMAXSWZ128rrk, X86::VPMAXSWZ128rmk, 0 }, - { X86::VPMAXUBZ128rrk, X86::VPMAXUBZ128rmk, 0 }, - { X86::VPMAXUDZ128rrk, X86::VPMAXUDZ128rmk, 0 }, - { X86::VPMAXUQZ128rrk, X86::VPMAXUQZ128rmk, 0 }, - { X86::VPMAXUWZ128rrk, X86::VPMAXUWZ128rmk, 0 }, - { X86::VPMINSBZ128rrk, X86::VPMINSBZ128rmk, 0 }, - { X86::VPMINSDZ128rrk, X86::VPMINSDZ128rmk, 0 }, - { X86::VPMINSQZ128rrk, X86::VPMINSQZ128rmk, 0 }, - { X86::VPMINSWZ128rrk, X86::VPMINSWZ128rmk, 0 }, - { X86::VPMINUBZ128rrk, X86::VPMINUBZ128rmk, 0 }, - { X86::VPMINUDZ128rrk, X86::VPMINUDZ128rmk, 0 }, - { X86::VPMINUQZ128rrk, X86::VPMINUQZ128rmk, 0 }, - { X86::VPMINUWZ128rrk, X86::VPMINUWZ128rmk, 0 }, - { X86::VPMULDQZ128rrk, X86::VPMULDQZ128rmk, 0 }, - { X86::VPMULLDZ128rrk, X86::VPMULLDZ128rmk, 0 }, - { X86::VPMULLQZ128rrk, X86::VPMULLQZ128rmk, 0 }, - { X86::VPMULLWZ128rrk, X86::VPMULLWZ128rmk, 0 }, - { X86::VPMULUDQZ128rrk, X86::VPMULUDQZ128rmk, 0 }, - { X86::VPORDZ128rrk, X86::VPORDZ128rmk, 0 }, - { X86::VPORQZ128rrk, X86::VPORQZ128rmk, 0 }, - { X86::VPSHUFBZ128rrk, X86::VPSHUFBZ128rmk, 0 }, - { X86::VPSLLDZ128rrk, X86::VPSLLDZ128rmk, 0 }, - { X86::VPSLLQZ128rrk, X86::VPSLLQZ128rmk, 0 }, - { X86::VPSLLVDZ128rrk, X86::VPSLLVDZ128rmk, 0 }, - { X86::VPSLLVQZ128rrk, X86::VPSLLVQZ128rmk, 0 }, - { X86::VPSLLVWZ128rrk, X86::VPSLLVWZ128rmk, 0 }, - { X86::VPSLLWZ128rrk, X86::VPSLLWZ128rmk, 0 }, - { X86::VPSRADZ128rrk, X86::VPSRADZ128rmk, 0 }, - { X86::VPSRAQZ128rrk, X86::VPSRAQZ128rmk, 0 }, - { X86::VPSRAVDZ128rrk, X86::VPSRAVDZ128rmk, 0 }, - { X86::VPSRAVQZ128rrk, X86::VPSRAVQZ128rmk, 0 }, - { X86::VPSRAVWZ128rrk, X86::VPSRAVWZ128rmk, 0 }, - { X86::VPSRAWZ128rrk, X86::VPSRAWZ128rmk, 0 }, - { X86::VPSRLDZ128rrk, X86::VPSRLDZ128rmk, 0 }, - { X86::VPSRLQZ128rrk, X86::VPSRLQZ128rmk, 0 }, - { X86::VPSRLVDZ128rrk, X86::VPSRLVDZ128rmk, 0 }, - { X86::VPSRLVQZ128rrk, X86::VPSRLVQZ128rmk, 0 }, - { X86::VPSRLVWZ128rrk, X86::VPSRLVWZ128rmk, 0 }, - { X86::VPSRLWZ128rrk, X86::VPSRLWZ128rmk, 0 }, - { X86::VPSUBBZ128rrk, X86::VPSUBBZ128rmk, 0 }, - { X86::VPSUBDZ128rrk, X86::VPSUBDZ128rmk, 0 }, - { X86::VPSUBQZ128rrk, X86::VPSUBQZ128rmk, 0 }, - { X86::VPSUBSBZ128rrk, X86::VPSUBSBZ128rmk, 0 }, - { X86::VPSUBSWZ128rrk, X86::VPSUBSWZ128rmk, 0 }, - { X86::VPSUBUSBZ128rrk, X86::VPSUBUSBZ128rmk, 0 }, - { X86::VPSUBUSWZ128rrk, X86::VPSUBUSWZ128rmk, 0 }, - { X86::VPSUBWZ128rrk, X86::VPSUBWZ128rmk, 0 }, - { X86::VPTERNLOGDZ128rrik, X86::VPTERNLOGDZ128rmik, 0 }, - { X86::VPTERNLOGQZ128rrik, X86::VPTERNLOGQZ128rmik, 0 }, - { X86::VPUNPCKHBWZ128rrk, X86::VPUNPCKHBWZ128rmk, 0 }, - { X86::VPUNPCKHDQZ128rrk, X86::VPUNPCKHDQZ128rmk, 0 }, - { X86::VPUNPCKHQDQZ128rrk, X86::VPUNPCKHQDQZ128rmk, 0 }, - { X86::VPUNPCKHWDZ128rrk, X86::VPUNPCKHWDZ128rmk, 0 }, - { X86::VPUNPCKLBWZ128rrk, X86::VPUNPCKLBWZ128rmk, 0 }, - { X86::VPUNPCKLDQZ128rrk, X86::VPUNPCKLDQZ128rmk, 0 }, - { X86::VPUNPCKLQDQZ128rrk, X86::VPUNPCKLQDQZ128rmk, 0 }, - { X86::VPUNPCKLWDZ128rrk, X86::VPUNPCKLWDZ128rmk, 0 }, - { X86::VPXORDZ128rrk, X86::VPXORDZ128rmk, 0 }, - { X86::VPXORQZ128rrk, X86::VPXORQZ128rmk, 0 }, - { X86::VSHUFPDZ128rrik, X86::VSHUFPDZ128rmik, 0 }, - { X86::VSHUFPSZ128rrik, 
X86::VSHUFPSZ128rmik, 0 }, - { X86::VSUBPDZ128rrk, X86::VSUBPDZ128rmk, 0 }, - { X86::VSUBPSZ128rrk, X86::VSUBPSZ128rmk, 0 }, - { X86::VUNPCKHPDZ128rrk, X86::VUNPCKHPDZ128rmk, 0 }, - { X86::VUNPCKHPSZ128rrk, X86::VUNPCKHPSZ128rmk, 0 }, - { X86::VUNPCKLPDZ128rrk, X86::VUNPCKLPDZ128rmk, 0 }, - { X86::VUNPCKLPSZ128rrk, X86::VUNPCKLPSZ128rmk, 0 }, - { X86::VXORPDZ128rrk, X86::VXORPDZ128rmk, 0 }, - { X86::VXORPSZ128rrk, X86::VXORPSZ128rmk, 0 }, - - // 512-bit three source instructions with zero masking. - { X86::VPERMI2Brrkz, X86::VPERMI2Brmkz, 0 }, - { X86::VPERMI2Drrkz, X86::VPERMI2Drmkz, 0 }, - { X86::VPERMI2PSrrkz, X86::VPERMI2PSrmkz, 0 }, - { X86::VPERMI2PDrrkz, X86::VPERMI2PDrmkz, 0 }, - { X86::VPERMI2Qrrkz, X86::VPERMI2Qrmkz, 0 }, - { X86::VPERMI2Wrrkz, X86::VPERMI2Wrmkz, 0 }, - { X86::VPERMT2Brrkz, X86::VPERMT2Brmkz, 0 }, - { X86::VPERMT2Drrkz, X86::VPERMT2Drmkz, 0 }, - { X86::VPERMT2PSrrkz, X86::VPERMT2PSrmkz, 0 }, - { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, - { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, - { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, - { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, - { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, - - // 256-bit three source instructions with zero masking. - { X86::VPERMI2B256rrkz, X86::VPERMI2B256rmkz, 0 }, - { X86::VPERMI2D256rrkz, X86::VPERMI2D256rmkz, 0 }, - { X86::VPERMI2PD256rrkz, X86::VPERMI2PD256rmkz, 0 }, - { X86::VPERMI2PS256rrkz, X86::VPERMI2PS256rmkz, 0 }, - { X86::VPERMI2Q256rrkz, X86::VPERMI2Q256rmkz, 0 }, - { X86::VPERMI2W256rrkz, X86::VPERMI2W256rmkz, 0 }, - { X86::VPERMT2B256rrkz, X86::VPERMT2B256rmkz, 0 }, - { X86::VPERMT2D256rrkz, X86::VPERMT2D256rmkz, 0 }, - { X86::VPERMT2PD256rrkz, X86::VPERMT2PD256rmkz, 0 }, - { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, - { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, - { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, - { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, - { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, - - // 128-bit three source instructions with zero masking. - { X86::VPERMI2B128rrkz, X86::VPERMI2B128rmkz, 0 }, - { X86::VPERMI2D128rrkz, X86::VPERMI2D128rmkz, 0 }, - { X86::VPERMI2PD128rrkz, X86::VPERMI2PD128rmkz, 0 }, - { X86::VPERMI2PS128rrkz, X86::VPERMI2PS128rmkz, 0 }, - { X86::VPERMI2Q128rrkz, X86::VPERMI2Q128rmkz, 0 }, - { X86::VPERMI2W128rrkz, X86::VPERMI2W128rmkz, 0 }, - { X86::VPERMT2B128rrkz, X86::VPERMT2B128rmkz, 0 }, - { X86::VPERMT2D128rrkz, X86::VPERMT2D128rmkz, 0 }, - { X86::VPERMT2PD128rrkz, X86::VPERMT2PD128rmkz, 0 }, - { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, - { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, - { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, - { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, - { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, - }; for (X86MemoryFoldTableEntry Entry : MemoryFoldTable4) { AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, @@ -3539,20 +163,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // Index 4, folded load Entry.Flags | TB_INDEX_4 | TB_FOLDED_LOAD); } - for (I = X86InstrFMA3Info::rm_begin(); I != E; ++I) { - if (I.getGroup()->isKMasked()) { - // Intrinsics need to pass TB_NO_REVERSE. 
- if (I.getGroup()->isIntrinsic()) { - AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD | TB_NO_REVERSE); - } else { - AddTableEntry(RegOp2MemOpTable4, MemOp2RegOpTable, - I.getRegOpcode(), I.getMemOpcode(), - TB_ALIGN_NONE | TB_INDEX_4 | TB_FOLDED_LOAD); - } - } - } } void @@ -5930,7 +2540,7 @@ void X86InstrInfo::replaceBranchWithTailCall( // Add implicit uses and defs of all live regs potentially clobbered by the // call. This way they still appear live across the call. - LivePhysRegs LiveRegs(&getRegisterInfo()); + LivePhysRegs LiveRegs(getRegisterInfo()); LiveRegs.addLiveOuts(MBB); SmallVector, 8> Clobbers; LiveRegs.stepForward(*MIB, Clobbers); @@ -6545,9 +3155,9 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // first frame index. // See X86ISelLowering.cpp - X86::hasCopyImplyingStackAdjustment. - const TargetRegisterInfo *TRI = &getRegisterInfo(); + const TargetRegisterInfo &TRI = getRegisterInfo(); MachineBasicBlock::LivenessQueryResult LQR = - MBB.computeRegisterLiveness(TRI, AX, MI); + MBB.computeRegisterLiveness(&TRI, AX, MI); // We do not want to save and restore AX if we do not have to. // Moreover, if we do so whereas AX is dead, we would need to set // an undef flag on the use of AX, otherwise the verifier will @@ -6564,7 +3174,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } // AX contains the top most register in the aliasing hierarchy. // It may not be live, but one of its aliases may be. - for (MCRegAliasIterator AI(AX, TRI, true); + for (MCRegAliasIterator AI(AX, &TRI, true); AI.isValid() && LQR != MachineBasicBlock::LQR_Live; ++AI) LQR = LPR.contains(*AI) ? MachineBasicBlock::LQR_Live : MachineBasicBlock::LQR_Dead; @@ -8374,7 +4984,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, unsigned Opc = LoadMI.getOpcode(); unsigned UserOpc = UserMI.getOpcode(); const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo(); - const TargetRegisterClass *RC = + const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(LoadMI.getOperand(0).getReg()); unsigned RegSize = TRI.getRegSizeInBits(*RC); @@ -10473,7 +7083,7 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const { // catch it. if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) || MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) || - MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) + MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP)) return MachineOutlinerInstrType::Illegal; // Outlined calls change the instruction pointer, so don't read from it. diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 01df07e1715f..fab70e918b8a 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -813,6 +813,8 @@ def UseAVX2 : Predicate<"Subtarget->hasAVX2() && !Subtarget->hasAVX512()">; def NoAVX512 : Predicate<"!Subtarget->hasAVX512()">; def HasCDI : Predicate<"Subtarget->hasCDI()">, AssemblerPredicate<"FeatureCDI", "AVX-512 CD ISA">; +def HasVPOPCNTDQ : Predicate<"Subtarget->hasVPOPCNTDQ()">, + AssemblerPredicate<"FeatureVPOPCNTDQ", "AVX-512 VPOPCNTDQ ISA">; def HasPFI : Predicate<"Subtarget->hasPFI()">, AssemblerPredicate<"FeaturePFI", "AVX-512 PF ISA">; def HasERI : Predicate<"Subtarget->hasERI()">, @@ -1436,11 +1438,14 @@ def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src), // Longer forms that use a ModR/M byte. 
Needed for disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOV8ri_alt : Ii8 <0xC6, MRM0r, (outs GR8 :$dst), (ins i8imm :$src), - "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV8ri">; def MOV16ri_alt : Ii16<0xC7, MRM0r, (outs GR16:$dst), (ins i16imm:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, + FoldGenData<"MOV16ri">; def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, + FoldGenData<"MOV32ri">; } } // SchedRW @@ -1563,13 +1568,17 @@ def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteMove] in { def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src), - "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV8rr">; def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize16, + FoldGenData<"MOV16rr">; def MOV32rr_REV : I<0x8B, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32; + "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, OpSize32, + FoldGenData<"MOV32rr">; def MOV64rr_REV : RI<0x8B, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>; + "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV>, + FoldGenData<"MOV64rr">; } let canFoldAsLoad = 1, isReMaterializable = 1, SchedRW = [WriteLoad] in { diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index dc3800ce381b..2c047722db24 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -248,7 +248,8 @@ def MMX_MOVD64grr : MMXI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR64:$src), "movd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (MMX_X86movd2w (x86mmx VR64:$src)))], - IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>; + IIC_MMX_MOV_REG_MM>, Sched<[WriteMove]>, + FoldGenData<"MMX_MOVD64rr">; let isBitcast = 1 in def MMX_MOVD64to64rr : MMXRI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR64:$src), @@ -277,7 +278,7 @@ def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src), "movq\t{$src, $dst|$dst, $src}", [], - IIC_MMX_MOVQ_RR>; + IIC_MMX_MOVQ_RR>, FoldGenData<"MMX_MOVQ64rr">; } } // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index f73d85e7e01b..a3e677209305 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -507,7 +507,8 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, multiclass sse12_move_rr { + string asm_opr, Domain d = GenericDomain, + string Name> { let isCommutable = 1 in def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), @@ -521,15 +522,17 @@ multiclass sse12_move_rr, Sched<[WriteFShuffle]>; + [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>, + FoldGenData; } multiclass sse12_move { + Domain d = GenericDomain, string Name> { // AVX defm V#NAME : sse12_move_rr, + "\t{$src2, $src1, $dst|$dst, 
$src1, $src2}", d, + "V"#Name>, VEX_4V, VEX_LIG, VEX_WIG; def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), @@ -539,7 +542,7 @@ multiclass sse12_move; + "\t{$src2, $dst|$dst, $src2}", d, Name>; } def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src), @@ -563,9 +566,9 @@ multiclass sse12_move_rm, XS; + SSEPackedSingle, "MOVSS">, XS; defm MOVSD : sse12_move, XD; + SSEPackedDouble, "MOVSD">, XD; let canFoldAsLoad = 1, isReMaterializable = 1 in { defm MOVSS : sse12_move_rm, VEX, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVAPSrr">; def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVAPDrr">; def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVUPSrr">; def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG, + FoldGenData<"VMOVUPDrr">; def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVAPSYrr">; def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVAPDYrr">; def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVUPSYrr">; def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVUPDYrr">; } // Aliases to help the assembler pick two byte VEX encodings by swapping the @@ -938,16 +949,16 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, SchedRW = [WriteFShuffle] in { def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movaps\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; + IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPSrr">; def MOVAPDrr_REV : PDI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movapd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; + IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVAPDrr">; def MOVUPSrr_REV : PSI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movups\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>; + IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPSrr">; def MOVUPDrr_REV : PDI<0x11, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movupd\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>; + IIC_SSE_MOVU_P_RR>, FoldGenData<"MOVUPDrr">; } let Predicates = [HasAVX, NoVLX] in { @@ -3752,17 +3763,19 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>, - VEX, VEX_WIG; + VEX, VEX_WIG, FoldGenData<"VMOVDQArr">; def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), 
"movdqa\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVDQAYrr">; def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>, - VEX, VEX_WIG; + VEX, VEX_WIG, FoldGenData<"VMOVDQUrr">; def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src), "movdqu\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG; + IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG, + FoldGenData<"VMOVDQUYrr">; } let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1, @@ -3820,11 +3833,12 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqa\t{$src, $dst|$dst, $src}", [], - IIC_SSE_MOVA_P_RR>; + IIC_SSE_MOVA_P_RR>, FoldGenData<"MOVDQArr">; def MOVDQUrr_REV : I<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src), "movdqu\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>; + [], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>, + FoldGenData<"MOVDQUrr">; } } // SchedRW @@ -5915,7 +5929,7 @@ multiclass SS41I_extract16 opc, string OpcodeStr> { (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, Sched<[WriteShuffle]>; + []>, Sched<[WriteShuffle]>, FoldGenData; let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteShuffleLd, WriteRMW] in diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 53224431c0e9..5dde2d07babe 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -111,7 +111,7 @@ multiclass xop3op opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, - XOP_4V, VEX_W, Sched<[WriteVarVecShift]>; + XOP_4V, VEX_W, Sched<[WriteVarVecShift]>, FoldGenData; } let ExeDomain = SSEPackedInt in { @@ -282,7 +282,7 @@ multiclass xop4op opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W; + []>, XOP_4V, VEX_W, FoldGenData; } let ExeDomain = SSEPackedInt in { @@ -318,7 +318,7 @@ multiclass xop4op_int opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W; + []>, XOP_4V, VEX_W, FoldGenData; } let ExeDomain = SSEPackedInt in { @@ -357,7 +357,7 @@ multiclass xop_vpermil2 Opc, string OpcodeStr, RegisterClass RC, (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W; + []>, VEX_W, FoldGenData; } let ExeDomain = SSEPackedDouble in { diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 61956f741820..77dead8d2413 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -302,6 +302,26 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, : HasAVX512 ? X86::VMOVUPSZ128mr_NOVLX : HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr); + } else if (Ty.isVector() && Ty.getSizeInBits() == 256) { + if (Alignment >= 32) + return Isload ? (HasVLX ? X86::VMOVAPSZ256rm + : HasAVX512 ? 
X86::VMOVAPSZ256rm_NOVLX + : X86::VMOVAPSYrm) + : (HasVLX ? X86::VMOVAPSZ256mr + : HasAVX512 ? X86::VMOVAPSZ256mr_NOVLX + : X86::VMOVAPSYmr); + else + return Isload ? (HasVLX ? X86::VMOVUPSZ256rm + : HasAVX512 ? X86::VMOVUPSZ256rm_NOVLX + : X86::VMOVUPSYrm) + : (HasVLX ? X86::VMOVUPSZ256mr + : HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX + : X86::VMOVUPSYmr); + } else if (Ty.isVector() && Ty.getSizeInBits() == 512) { + if (Alignment >= 64) + return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr; + else + return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr; } return Opc; } diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index da724f5d8989..979aaee110aa 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -35,6 +35,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizerInfoSSE1(); setLegalizerInfoSSE2(); setLegalizerInfoSSE41(); + setLegalizerInfoAVX(); setLegalizerInfoAVX2(); setLegalizerInfoAVX512(); setLegalizerInfoAVX512DQ(); @@ -209,6 +210,18 @@ void X86LegalizerInfo::setLegalizerInfoSSE41() { setAction({G_MUL, v4s32}, Legal); } +void X86LegalizerInfo::setLegalizerInfoAVX() { + if (!Subtarget.hasAVX()) + return; + + const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + + for (unsigned MemOp : {G_LOAD, G_STORE}) + for (auto Ty : {v8s32, v4s64}) + setAction({MemOp, Ty}, Legal); +} + void X86LegalizerInfo::setLegalizerInfoAVX2() { if (!Subtarget.hasAVX2()) return; @@ -239,6 +252,10 @@ void X86LegalizerInfo::setLegalizerInfoAVX512() { setAction({G_MUL, v16s32}, Legal); + for (unsigned MemOp : {G_LOAD, G_STORE}) + for (auto Ty : {v16s32, v8s64}) + setAction({MemOp, Ty}, Legal); + /************ VLX *******************/ if (!Subtarget.hasVLX()) return; diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h index ab5405a70427..135950a95f84 100644 --- a/lib/Target/X86/X86LegalizerInfo.h +++ b/lib/Target/X86/X86LegalizerInfo.h @@ -39,6 +39,7 @@ private: void setLegalizerInfoSSE1(); void setLegalizerInfoSSE2(); void setLegalizerInfoSSE41(); + void setLegalizerInfoAVX(); void setLegalizerInfoAVX2(); void setLegalizerInfoAVX512(); void setLegalizerInfoAVX512DQ(); diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 2b1f43bffd71..84ec98484f8e 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -286,6 +286,7 @@ void X86Subtarget::initializeEnvironment() { HasCDI = false; HasPFI = false; HasDQI = false; + HasVPOPCNTDQ = false; HasBWI = false; HasVLX = false; HasADX = false; diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index a9f3a2aee1be..550e95c39ab5 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -270,6 +270,9 @@ protected: /// Processor has AVX-512 Conflict Detection Instructions bool HasCDI; + /// Processor has AVX-512 population count Instructions + bool HasVPOPCNTDQ; + /// Processor has AVX-512 Doubleword and Quadword instructions bool HasDQI; @@ -494,6 +497,7 @@ public: bool slow3OpsLEA() const { return Slow3OpsLEA; } bool slowIncDec() const { return SlowIncDec; } bool hasCDI() const { return HasCDI; } + bool hasVPOPCNTDQ() const { return HasVPOPCNTDQ; } bool hasPFI() const { return HasPFI; } bool hasERI() const { return HasERI; } bool hasDQI() const { return HasDQI; } diff --git a/lib/Transforms/Coroutines/CoroCleanup.cpp b/lib/Transforms/Coroutines/CoroCleanup.cpp index a97db6fde454..5cf2a8c25d83 100644 --- 
a/lib/Transforms/Coroutines/CoroCleanup.cpp +++ b/lib/Transforms/Coroutines/CoroCleanup.cpp @@ -124,6 +124,7 @@ struct CoroCleanup : FunctionPass { if (!L) AU.setPreservesAll(); } + StringRef getPassName() const override { return "Coroutine Cleanup"; } }; } diff --git a/lib/Transforms/Coroutines/CoroEarly.cpp b/lib/Transforms/Coroutines/CoroEarly.cpp index e8bb0ca99d8a..b52989186165 100644 --- a/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/lib/Transforms/Coroutines/CoroEarly.cpp @@ -208,6 +208,9 @@ struct CoroEarly : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); } + StringRef getPassName() const override { + return "Lower early coroutine intrinsics"; + } }; } diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp index c6ac3f614ff7..acb22449142b 100644 --- a/lib/Transforms/Coroutines/CoroElide.cpp +++ b/lib/Transforms/Coroutines/CoroElide.cpp @@ -301,6 +301,7 @@ struct CoroElide : FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); } + StringRef getPassName() const override { return "Coroutine Elision"; } }; } diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 417d57f7625b..85e9003ec3c5 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -799,9 +799,9 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { splitAround(CSI, "CoroSuspend"); } - // Put fallthrough CoroEnd into its own block. Note: Shape::buildFrom places - // the fallthrough coro.end as the first element of CoroEnds array. - splitAround(Shape.CoroEnds.front(), "CoroEnd"); + // Put CoroEnds into their own blocks. + for (CoroEndInst *CE : Shape.CoroEnds) + splitAround(CE, "CoroEnd"); // Transforms multi-edge PHI Nodes, so that any value feeding into a PHI will // never has its definition separated from the PHI by the suspend point. @@ -813,19 +813,24 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { IRBuilder<> Builder(F.getContext()); SpillInfo Spills; - // See if there are materializable instructions across suspend points. - for (Instruction &I : instructions(F)) - if (materializable(I)) - for (User *U : I.users()) - if (Checker.isDefinitionAcrossSuspend(I, U)) - Spills.emplace_back(&I, U); + for (int Repeat = 0; Repeat < 4; ++Repeat) { + // See if there are materializable instructions across suspend points. + for (Instruction &I : instructions(F)) + if (materializable(I)) + for (User *U : I.users()) + if (Checker.isDefinitionAcrossSuspend(I, U)) + Spills.emplace_back(&I, U); - // Rewrite materializable instructions to be materialized at the use point. - DEBUG(dump("Materializations", Spills)); - rewriteMaterializableInstructions(Builder, Spills); + if (Spills.empty()) + break; + + // Rewrite materializable instructions to be materialized at the use point. + DEBUG(dump("Materializations", Spills)); + rewriteMaterializableInstructions(Builder, Spills); + Spills.clear(); + } // Collect the spills for arguments and other not-materializable values. 
- Spills.clear(); for (Argument &A : F.args()) for (User *U : A.users()) if (Checker.isDefinitionAcrossSuspend(A, U)) @@ -847,8 +852,6 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) { if (I.getType()->isTokenTy()) report_fatal_error( "token definition is separated from the use by a suspend point"); - assert(!materializable(I) && - "rewriteMaterializable did not do its job"); Spills.emplace_back(&I, U); } } diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp index 12eb16789825..cd549e4be282 100644 --- a/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/lib/Transforms/Coroutines/CoroSplit.cpp @@ -228,15 +228,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape, SmallVector Returns; - if (DISubprogram *SP = F.getSubprogram()) { - // If we have debug info, add mapping for the metadata nodes that should not - // be cloned by CloneFunctionInfo. - auto &MD = VMap.MD(); - MD[SP->getUnit()].reset(SP->getUnit()); - MD[SP->getType()].reset(SP->getType()); - MD[SP->getFile()].reset(SP->getFile()); - } - CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/true, Returns); + CloneFunctionInto(NewF, &F, VMap, /*ModuleLevelChanges=*/false, Returns); // Remove old returns. for (ReturnInst *Return : Returns) @@ -509,12 +501,87 @@ static void simplifySuspendPoints(coro::Shape &Shape) { S.resize(N); } +static SmallPtrSet getCoroBeginPredBlocks(CoroBeginInst *CB) { + // Collect all blocks that we need to look for instructions to relocate. + SmallPtrSet RelocBlocks; + SmallVector Work; + Work.push_back(CB->getParent()); + + do { + BasicBlock *Current = Work.pop_back_val(); + for (BasicBlock *BB : predecessors(Current)) + if (RelocBlocks.count(BB) == 0) { + RelocBlocks.insert(BB); + Work.push_back(BB); + } + } while (!Work.empty()); + return RelocBlocks; +} + +static SmallPtrSet +getNotRelocatableInstructions(CoroBeginInst *CoroBegin, + SmallPtrSetImpl &RelocBlocks) { + SmallPtrSet DoNotRelocate; + // Collect all instructions that we should not relocate + SmallVector Work; + + // Start with CoroBegin and terminators of all preceding blocks. + Work.push_back(CoroBegin); + BasicBlock *CoroBeginBB = CoroBegin->getParent(); + for (BasicBlock *BB : RelocBlocks) + if (BB != CoroBeginBB) + Work.push_back(BB->getTerminator()); + + // For every instruction in the Work list, place its operands in DoNotRelocate + // set. + do { + Instruction *Current = Work.pop_back_val(); + DoNotRelocate.insert(Current); + for (Value *U : Current->operands()) { + auto *I = dyn_cast(U); + if (!I) + continue; + if (isa(U)) + continue; + if (DoNotRelocate.count(I) == 0) { + Work.push_back(I); + DoNotRelocate.insert(I); + } + } + } while (!Work.empty()); + return DoNotRelocate; +} + +static void relocateInstructionBefore(CoroBeginInst *CoroBegin, Function &F) { + // Analyze which non-alloca instructions are needed for allocation and + // relocate the rest to after coro.begin. We need to do it, since some of the + // targets of those instructions may be placed into coroutine frame memory + // for which becomes available after coro.begin intrinsic. + + auto BlockSet = getCoroBeginPredBlocks(CoroBegin); + auto DoNotRelocateSet = getNotRelocatableInstructions(CoroBegin, BlockSet); + + Instruction *InsertPt = CoroBegin->getNextNode(); + BasicBlock &BB = F.getEntryBlock(); // TODO: Look at other blocks as well. 
+ for (auto B = BB.begin(), E = BB.end(); B != E;) { + Instruction &I = *B++; + if (isa(&I)) + continue; + if (&I == CoroBegin) + break; + if (DoNotRelocateSet.count(&I)) + continue; + I.moveBefore(InsertPt); + } +} + static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) { coro::Shape Shape(F); if (!Shape.CoroBegin) return; simplifySuspendPoints(Shape); + relocateInstructionBefore(Shape.CoroBegin, F); buildCoroutineFrame(F, Shape); replaceFrameSize(Shape); @@ -660,6 +727,7 @@ struct CoroSplit : public CallGraphSCCPass { void getAnalysisUsage(AnalysisUsage &AU) const override { CallGraphSCCPass::getAnalysisUsage(AU); } + StringRef getPassName() const override { return "Coroutine Splitting"; } }; } diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp index 8dff2fb3be8a..4c417f1c55eb 100644 --- a/lib/Transforms/IPO/PartialInlining.cpp +++ b/lib/Transforms/IPO/PartialInlining.cpp @@ -558,17 +558,17 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( std::vector Users(DuplicateFunction->user_begin(), DuplicateFunction->user_end()); Function *CurrentCaller = nullptr; + std::unique_ptr TempBFI; BlockFrequencyInfo *CurrentCallerBFI = nullptr; auto ComputeCurrBFI = [&,this](Function *Caller) { // For the old pass manager: if (!GetBFI) { - if (CurrentCallerBFI) - delete CurrentCallerBFI; DominatorTree DT(*Caller); LoopInfo LI(DT); BranchProbabilityInfo BPI(*Caller, LI); - CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI); + TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI)); + CurrentCallerBFI = TempBFI.get(); } else { // New pass manager: CurrentCallerBFI = &(*GetBFI)(*Caller); @@ -591,10 +591,6 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap( else CallSiteToProfCountMap[User] = 0; } - if (!GetBFI) { - if (CurrentCallerBFI) - delete CurrentCallerBFI; - } } Function *PartialInlinerImpl::unswitchFunction(Function *F) { diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index ec06d5f9fb05..9fd3a9021a27 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -155,6 +155,10 @@ static cl::opt cl::Hidden, cl::desc("Enable the simple loop unswitch pass.")); +static cl::opt EnableGVNSink( + "enable-gvn-sink", cl::init(false), cl::Hidden, + cl::desc("Enable the GVN sinking pass (default = on)")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -307,6 +311,11 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createEarlyCSEPass()); // Catch trivial redundancies if (EnableGVNHoist) MPM.add(createGVNHoistPass()); + if (EnableGVNSink) { + MPM.add(createGVNSinkPass()); + MPM.add(createCFGSimplificationPass()); + } + // Speculative execution if the target has divergent branches; otherwise nop. MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass()); MPM.add(createJumpThreadingPass()); // Thread jumps. @@ -904,6 +913,12 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) { if (OptLevel != 0) addLTOOptimizationPasses(PM); + else { + // The whole-program-devirt pass needs to run at -O0 because only it knows + // about the llvm.type.checked.load intrinsic: it needs to both lower the + // intrinsic itself and handle it in the summary. + PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr)); + } // Create a function that performs CFI checks for cross-DSO calls with targets // in the current module. 
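The PassManagerBuilder hunk above wires the new GVN sinking pass into the legacy pipeline behind an off-by-default -enable-gvn-sink flag and follows it with a CFG simplification run. As a rough illustration of that same pairing outside the builder, here is a minimal sketch using the legacy pass manager; createGVNSinkPass and createCFGSimplificationPass are the entry points referenced in the hunk, while the header choices and the runGVNSinkPipeline wrapper are assumptions made only for this example.

// Sketch only: run GVNSink followed by CFG simplification over a module,
// mirroring the pipeline ordering introduced above. Header locations are
// assumed; adjust to wherever createGVNSinkPass is declared in your tree.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVN.h"

void runGVNSinkPipeline(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createGVNSinkPass());           // sink common code into successors
  PM.add(llvm::createCFGSimplificationPass()); // clean up blocks emptied by sinking
  PM.run(M);
}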
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index 733eeb1767a3..7204bf517681 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -861,12 +861,9 @@ bool InstCombiner::willNotOverflowSignedSub(const Value *LHS, ComputeNumSignBits(RHS, 0, &CxtI) > 1) return true; - unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, 0, &CxtI); + KnownBits LHSKnown = computeKnownBits(LHS, 0, &CxtI); - KnownBits RHSKnown(BitWidth); - computeKnownBits(RHS, RHSKnown, 0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, 0, &CxtI); // Subtraction of two 2's complement numbers having identical signs will // never overflow. @@ -1059,9 +1056,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // If this is a xor that was canonicalized from a sub, turn it back into // a sub and fuse this add with it. if (LHS->hasOneUse() && (XorRHS->getValue()+1).isPowerOf2()) { - IntegerType *IT = cast(I.getType()); - KnownBits LHSKnown(IT->getBitWidth()); - computeKnownBits(XorLHS, LHSKnown, 0, &I); + KnownBits LHSKnown = computeKnownBits(XorLHS, 0, &I); if ((XorRHS->getValue() | LHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); @@ -1577,8 +1572,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known // zero. if (Op0C->isMask()) { - KnownBits RHSKnown(BitWidth); - computeKnownBits(Op1, RHSKnown, 0, &I); + KnownBits RHSKnown = computeKnownBits(Op1, 0, &I); if ((*Op0C | RHSKnown.Zero).isAllOnesValue()) return BinaryOperator::CreateXor(Op1, Op0); } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 4227b2d01be8..1f8319efb3be 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1610,17 +1610,13 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Value *Mask = nullptr; Value *Masked = nullptr; if (LAnd->getOperand(0) == RAnd->getOperand(0) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(1), DL, false, 0, &AC, CxtI, - &DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(1), DL, false, 0, &AC, CxtI, - &DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(1), false, 0, CxtI) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(1), false, 0, CxtI)) { Mask = Builder->CreateOr(LAnd->getOperand(1), RAnd->getOperand(1)); Masked = Builder->CreateAnd(LAnd->getOperand(0), Mask); } else if (LAnd->getOperand(1) == RAnd->getOperand(1) && - isKnownToBeAPowerOfTwo(LAnd->getOperand(0), DL, false, 0, &AC, - CxtI, &DT) && - isKnownToBeAPowerOfTwo(RAnd->getOperand(0), DL, false, 0, &AC, - CxtI, &DT)) { + isKnownToBeAPowerOfTwo(LAnd->getOperand(0), false, 0, CxtI) && + isKnownToBeAPowerOfTwo(RAnd->getOperand(0), false, 0, CxtI)) { Mask = Builder->CreateOr(LAnd->getOperand(0), RAnd->getOperand(0)); Masked = Builder->CreateAnd(LAnd->getOperand(1), Mask); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index face7abcc95f..92a38f26dde7 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1378,9 +1378,7 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { if (!IT) return nullptr; - unsigned BitWidth = IT->getBitWidth(); 
- KnownBits Known(BitWidth); - IC.computeKnownBits(Op0, Known, 0, &II); + KnownBits Known = IC.computeKnownBits(Op0, 0, &II); // Create a mask for bits above (ctlz) or below (cttz) the first known one. bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; @@ -1401,7 +1399,9 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // If the input to cttz/ctlz is known to be non-zero, // then change the 'ZeroIsUndef' parameter to 'true' // because we know the zero behavior can't affect the result. - if (Known.One != 0 || isKnownNonZero(Op0, IC.getDataLayout())) { + if (Known.One != 0 || + isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II, + &IC.getDominatorTree())) { if (!match(II.getArgOperand(1), m_One())) { II.setOperand(1, IC.Builder->getTrue()); return &II; diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index f4bf5221f6a2..766939c56dff 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -692,8 +692,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: - KnownBits Known(Op1C->getType()->getBitWidth()); - computeKnownBits(ICI->getOperand(0), Known, 0, &CI); + KnownBits Known = computeKnownBits(ICI->getOperand(0), 0, &CI); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? @@ -737,14 +736,11 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, ZExtInst &CI, // may lead to additional simplifications. if (ICI->isEquality() && CI.getType() == ICI->getOperand(0)->getType()) { if (IntegerType *ITy = dyn_cast(CI.getType())) { - uint32_t BitWidth = ITy->getBitWidth(); Value *LHS = ICI->getOperand(0); Value *RHS = ICI->getOperand(1); - KnownBits KnownLHS(BitWidth); - KnownBits KnownRHS(BitWidth); - computeKnownBits(LHS, KnownLHS, 0, &CI); - computeKnownBits(RHS, KnownRHS, 0, &CI); + KnownBits KnownLHS = computeKnownBits(LHS, 0, &CI); + KnownBits KnownRHS = computeKnownBits(RHS, 0, &CI); if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) { APInt KnownBits = KnownLHS.Zero | KnownLHS.One; @@ -1063,9 +1059,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { // the icmp and sext into bitwise/integer operations. if (ICI->hasOneUse() && ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){ - unsigned BitWidth = Op1C->getType()->getBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Op0, Known, 0, &CI); + KnownBits Known = computeKnownBits(Op0, 0, &CI); APInt KnownZeroMask(~Known.Zero); if (KnownZeroMask.isPowerOf2()) { @@ -1104,7 +1098,7 @@ Instruction *InstCombiner::transformSExtICmp(ICmpInst *ICI, Instruction &CI) { // Distribute the bit over the whole bit width. 
In = Builder->CreateAShr(In, ConstantInt::get(In->getType(), - BitWidth - 1), "sext"); + KnownZeroMask.getBitWidth() - 1), "sext"); } if (CI.getType() == In->getType()) diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6492eaedae9c..2c2b7317a1c0 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1402,9 +1402,9 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) { if (*C == 0 && Pred == ICmpInst::ICMP_SGT) { SelectPatternResult SPR = matchSelectPattern(X, A, B); if (SPR.Flavor == SPF_SMIN) { - if (isKnownPositive(A, DL)) + if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT)) return new ICmpInst(Pred, B, Cmp.getOperand(1)); - if (isKnownPositive(B, DL)) + if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT)) return new ICmpInst(Pred, A, Cmp.getOperand(1)); } } @@ -1478,8 +1478,7 @@ Instruction *InstCombiner::foldICmpTruncConstant(ICmpInst &Cmp, // of the high bits truncated out of x are known. unsigned DstBits = Trunc->getType()->getScalarSizeInBits(), SrcBits = X->getType()->getScalarSizeInBits(); - KnownBits Known(SrcBits); - computeKnownBits(X, Known, 0, &Cmp); + KnownBits Known = computeKnownBits(X, 0, &Cmp); // If all the high bits are known, we can do this xform. if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) { @@ -3030,18 +3029,21 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { break; case Instruction::Add: case Instruction::Sub: - case Instruction::Xor: + case Instruction::Xor: { if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); - // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b - if (ConstantInt *CI = dyn_cast(BO0->getOperand(1))) { - if (CI->getValue().isSignMask()) { + + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C))) { + // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b + if (C->isSignMask()) { ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } - if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { + // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b + if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) { ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); NewPred = I.getSwappedPredicate(NewPred); @@ -3049,26 +3051,30 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { } } break; - case Instruction::Mul: + } + case Instruction::Mul: { if (!I.isEquality()) break; - if (ConstantInt *CI = dyn_cast(BO0->getOperand(1))) { - // a * Cst icmp eq/ne b * Cst --> a & Mask icmp b & Mask - // Mask = -1 >> count-trailing-zeros(Cst). - if (!CI->isZero() && !CI->isOne()) { - const APInt &AP = CI->getValue(); - ConstantInt *Mask = ConstantInt::get( - I.getContext(), - APInt::getLowBitsSet(AP.getBitWidth(), - AP.getBitWidth() - AP.countTrailingZeros())); + const APInt *C; + if (match(BO0->getOperand(1), m_APInt(C)) && *C != 0 && *C != 1) { + // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask) + // Mask = -1 >> count-trailing-zeros(C). 
+ if (unsigned TZs = C->countTrailingZeros()) { + Constant *Mask = ConstantInt::get( + BO0->getType(), + APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs)); Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask); Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask); return new ICmpInst(Pred, And1, And2); } + // If there are no trailing zeros in the multiplier, just eliminate + // the multiplies (no masking is needed): + // icmp eq/ne (X * C), (Y * C) --> icmp eq/ne X, Y + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); } break; - + } case Instruction::UDiv: case Instruction::LShr: if (I.isSigned() || !BO0->isExact() || !BO1->isExact()) @@ -4497,7 +4503,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { // if A is a power of 2. if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && match(Op1, m_Zero()) && - isKnownToBeAPowerOfTwo(A, DL, false, 0, &AC, &I, &DT) && I.isEquality()) + isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality()) return new ICmpInst(I.getInversePredicate(), Builder->CreateAnd(A, B), Op1); diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 6829be86885b..56f133de3de1 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -540,6 +540,12 @@ public: return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); } + bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false, + unsigned Depth = 0, + const Instruction *CxtI = nullptr) { + return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT); + } + bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0, const Instruction *CxtI = nullptr) const { return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index fc13854f8fe7..4d408359eeea 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -47,9 +47,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC, // inexact. Similarly for <<. BinaryOperator *I = dyn_cast(V); if (I && I->isLogicalShift() && - isKnownToBeAPowerOfTwo(I->getOperand(0), IC.getDataLayout(), false, 0, - &IC.getAssumptionCache(), &CxtI, - &IC.getDominatorTree())) { + IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. 
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) { @@ -1240,7 +1238,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { return BO; } - if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) // Safe because the only negative value (1 << Y) can take on is // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have @@ -1487,7 +1485,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { I.getType()); // X urem Y -> X and Y-1, where Y is a power of 2, - if (isKnownToBeAPowerOfTwo(Op1, DL, /*OrZero*/ true, 0, &AC, &I, &DT)) { + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); return BinaryOperator::CreateAnd(Op0, Add); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index 219effce7ba5..b40d067b2817 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -44,7 +44,8 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) { Value *A; Constant *C; if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C)))) - if (isKnownNonNegative(A, DL) && isKnownNonNegative(C, DL)) + if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) && + isKnownNonNegative(C, DL, 0, &AC, &I, &DT)) return BinaryOperator::Create( I.getOpcode(), Builder->CreateBinOp(I.getOpcode(), Op0, C), A); diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 4028a92771a4..5df55f01b83f 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -158,8 +158,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 are known to be clear if zero in either the LHS | RHS. APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero; @@ -192,8 +192,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 bits are only known if clear in both the LHS & RHS. 
APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero; @@ -224,8 +224,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // Output known-0 bits are known if clear or set in both the LHS & RHS. APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) | @@ -313,8 +313,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) || SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1)) return I; - assert(!(RHSKnown.Zero & RHSKnown.One) && "Bits known to be one AND zero?"); - assert(!(LHSKnown.Zero & LHSKnown.One) && "Bits known to be one AND zero?"); + assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?"); + assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?"); // If the operands are constants, see if we can simplify them. if (ShrinkDemandedConstant(I, 1, DemandedMask) || @@ -325,15 +325,19 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, Known.One = RHSKnown.One & LHSKnown.One; Known.Zero = RHSKnown.Zero & LHSKnown.Zero; break; + case Instruction::ZExt: case Instruction::Trunc: { - unsigned truncBf = I->getOperand(0)->getType()->getScalarSizeInBits(); - DemandedMask = DemandedMask.zext(truncBf); - Known = Known.zext(truncBf); - if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); + + APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth); + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1)) return I; - DemandedMask = DemandedMask.trunc(BitWidth); - Known = Known.trunc(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + Known = Known.zextOrTrunc(BitWidth); + // Any top bits are known to be zero. + if (BitWidth > SrcBitWidth) + Known.Zero.setBitsFrom(SrcBitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case Instruction::BitCast: @@ -355,56 +359,36 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; - case Instruction::ZExt: { - // Compute the bits in the result that are not present in the input. - unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); - - DemandedMask = DemandedMask.trunc(SrcBitWidth); - Known = Known.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1)) - return I; - DemandedMask = DemandedMask.zext(BitWidth); - Known = Known.zext(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - // The top bits are known to be zero. - Known.Zero.setBitsFrom(SrcBitWidth); - break; - } case Instruction::SExt: { // Compute the bits in the result that are not present in the input. 
- unsigned SrcBitWidth =I->getOperand(0)->getType()->getScalarSizeInBits(); + unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits(); - APInt InputDemandedBits = DemandedMask & - APInt::getLowBitsSet(BitWidth, SrcBitWidth); + APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth); - APInt NewBits(APInt::getBitsSetFrom(BitWidth, SrcBitWidth)); // If any of the sign extended bits are demanded, we know that the sign // bit is demanded. - if ((NewBits & DemandedMask) != 0) + if (DemandedMask.getActiveBits() > SrcBitWidth) InputDemandedBits.setBit(SrcBitWidth-1); - InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth); - Known = Known.trunc(SrcBitWidth); - if (SimplifyDemandedBits(I, 0, InputDemandedBits, Known, Depth + 1)) + KnownBits InputKnown(SrcBitWidth); + if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1)) return I; - InputDemandedBits = InputDemandedBits.zext(BitWidth); - Known = Known.zext(BitWidth); - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); - - // If the sign bit of the input is known set or clear, then we know the - // top bits of the result. // If the input sign bit is known zero, or if the NewBits are not demanded // convert this into a zero extension. - if (Known.Zero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits) { - // Convert to ZExt cast + if (InputKnown.isNonNegative() || + DemandedMask.getActiveBits() <= SrcBitWidth) { + // Convert to ZExt cast. CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName()); return InsertNewInstWith(NewCast, *I); - } else if (Known.One[SrcBitWidth-1]) { // Input sign bit known set - Known.One |= NewBits; - } + } + + // If the sign bit of the input is known set or clear, then we know the + // top bits of the result. + Known = InputKnown.sext(BitWidth); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } case Instruction::Add: @@ -467,7 +451,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero <<= ShiftAmt; Known.One <<= ShiftAmt; // low bits known zero. @@ -491,7 +475,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); Known.Zero.lshrInPlace(ShiftAmt); Known.One.lshrInPlace(ShiftAmt); if (ShiftAmt) @@ -535,7 +519,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1)) return I; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); // Compute the new bits that are at the top now. 
APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt)); Known.Zero.lshrInPlace(ShiftAmt); @@ -590,7 +574,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One)) Known.One |= ~LowBits; - assert(!(Known.Zero & Known.One) && "Bits known to be one AND zero?"); + assert(!Known.hasConflict() && "Bits known to be one AND zero?"); break; } } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 7ed9fd566b37..2730afc5c5b9 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1963,6 +1963,7 @@ static bool isAllocSiteRemovable(Instruction *AI, // Give up the moment we see something we can't handle. return false; + case Instruction::AddrSpaceCast: case Instruction::BitCast: case Instruction::GetElementPtr: Users.emplace_back(I); @@ -2064,7 +2065,8 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { replaceInstUsesWith(*C, ConstantInt::get(Type::getInt1Ty(C->getContext()), C->isFalseWhenEqual())); - } else if (isa(I) || isa(I)) { + } else if (isa(I) || isa(I) || + isa(I)) { replaceInstUsesWith(*I, UndefValue::get(I->getType())); } eraseInstFromFunction(*I); @@ -2180,8 +2182,7 @@ Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) { // There might be assume intrinsics dominating this return that completely // determine the value. If so, constant fold it. - KnownBits Known(VTy->getPrimitiveSizeInBits()); - computeKnownBits(ResultOp, Known, 0, &RI); + KnownBits Known = computeKnownBits(ResultOp, 0, &RI); if (Known.isConstant()) RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant())); @@ -2242,9 +2243,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { return &SI; } - unsigned BitWidth = cast(Cond->getType())->getBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Cond, Known, 0, &SI); + KnownBits Known = computeKnownBits(Cond, 0, &SI); unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); @@ -2257,12 +2256,12 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes()); } - unsigned NewWidth = BitWidth - std::max(LeadingKnownZeros, LeadingKnownOnes); + unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes); // Shrink the condition operand if the new type is smaller than the old type. // This may produce a non-standard type for the switch, but that's ok because // the backend should extend back to a legal type for the target. - if (NewWidth > 0 && NewWidth < BitWidth) { + if (NewWidth > 0 && NewWidth < Known.getBitWidth()) { IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth); Builder->SetInsertPoint(&SI); Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc"); @@ -2841,9 +2840,7 @@ bool InstCombiner::run() { // a value even when the operands are not all constants. 
Type *Ty = I->getType(); if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) { - unsigned BitWidth = Ty->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(I, Known, /*Depth*/0, I); + KnownBits Known = computeKnownBits(I, /*Depth*/0, I); if (Known.isConstant()) { Constant *C = ConstantInt::get(Ty, Known.getConstant()); DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C << diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index 990bcec109de..1e30dbf6b55a 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -180,7 +180,7 @@ static cl::opt static cl::opt PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden, cl::desc("Use this option to turn on/off " - "memory instrinsic size profiling.")); + "memory intrinsic size profiling.")); // Command line option to turn on CFG dot dump after profile annotation. // Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 4bc0a7133118..300085eccb0c 100644 --- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -401,7 +401,10 @@ static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB, if (Options.NoPrune || &F.getEntryBlock() == BB) return true; - return !(isFullDominator(BB, DT) || isFullPostDominator(BB, PDT)); + // Do not instrument full dominators, or full post-dominators with multiple + // predecessors. + return !isFullDominator(BB, DT) + && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor()); } bool SanitizerCoverageModule::runOnFunction(Function &F) { diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index 523390758769..f5196cc46181 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -13,6 +13,7 @@ add_llvm_library(LLVMScalarOpts GuardWidening.cpp GVN.cpp GVNHoist.cpp + GVNSink.cpp IVUsersPrinter.cpp InductiveRangeCheckElimination.cpp IndVarSimplify.cpp diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp index f62e111460ca..c3810366bf22 100644 --- a/lib/Transforms/Scalar/ConstantHoisting.cpp +++ b/lib/Transforms/Scalar/ConstantHoisting.cpp @@ -164,9 +164,9 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst, /// \brief Given \p BBs as input, find another set of BBs which collectively /// dominates \p BBs and have the minimal sum of frequencies. Return the BB /// set found in \p BBs. -void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, - BasicBlock *Entry, - SmallPtrSet &BBs) { +static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI, + BasicBlock *Entry, + SmallPtrSet &BBs) { assert(!BBs.count(Entry) && "Assume Entry is not in BBs"); // Nodes on the current path to the root. 
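A small sketch of the pruning rule introduced in shouldInstrumentBlock above, with the dominator queries reduced to hypothetical boolean inputs: a block is skipped when it is a full dominator, or when it is a full post-dominator that has more than one predecessor; full post-dominators with a single predecessor keep their counter so edge coverage is not lost.

static bool shouldInstrument(bool NoPrune, bool IsEntry, bool IsFullDominator,
                             bool IsFullPostDominator, bool HasSinglePred) {
  if (NoPrune || IsEntry)
    return true;
  // Same shape as the patched return statement, spelled out on booleans.
  return !IsFullDominator && !(IsFullPostDominator && !HasSinglePred);
}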
SmallPtrSet Path; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index 0490d93f6455..0d6e0538261d 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -80,9 +80,10 @@ MaxRecurseDepth("max-recurse-depth", cl::Hidden, cl::init(1000), cl::ZeroOrMore, struct llvm::GVN::Expression { uint32_t opcode; Type *type; + bool commutative; SmallVector varargs; - Expression(uint32_t o = ~2U) : opcode(o) {} + Expression(uint32_t o = ~2U) : opcode(o), commutative(false) {} bool operator==(const Expression &other) const { if (opcode != other.opcode) @@ -246,6 +247,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!"); if (e.varargs[0] > e.varargs[1]) std::swap(e.varargs[0], e.varargs[1]); + e.commutative = true; } if (CmpInst *C = dyn_cast(I)) { @@ -256,6 +258,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) { Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (C->getOpcode() << 8) | Predicate; + e.commutative = true; } else if (InsertValueInst *E = dyn_cast(I)) { for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end(); II != IE; ++II) @@ -281,6 +284,7 @@ GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode, Predicate = CmpInst::getSwappedPredicate(Predicate); } e.opcode = (Opcode << 8) | Predicate; + e.commutative = true; return e; } @@ -348,25 +352,25 @@ GVN::ValueTable::~ValueTable() = default; /// add - Insert a value into the table with a specified value number. void GVN::ValueTable::add(Value *V, uint32_t num) { valueNumbering.insert(std::make_pair(V, num)); + if (PHINode *PN = dyn_cast(V)) + NumberingPhi[num] = PN; } uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) { if (AA->doesNotAccessMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } else if (AA->onlyReadsMemory(C)) { Expression exp = createExpr(C); - uint32_t &e = expressionNumbering[exp]; - if (!e) { - e = nextValueNumber++; - valueNumbering[C] = e; - return e; + auto ValNum = assignExpNewValueNum(exp); + if (ValNum.second) { + valueNumbering[C] = ValNum.first; + return ValNum.first; } if (!MD) { - e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[C] = e; return e; } @@ -522,23 +526,29 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) { case Instruction::ExtractValue: exp = createExtractvalueExpr(cast(I)); break; + case Instruction::PHI: + valueNumbering[V] = nextValueNumber; + NumberingPhi[nextValueNumber] = cast(V); + return nextValueNumber++; default: valueNumbering[V] = nextValueNumber; return nextValueNumber++; } - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; + uint32_t e = assignExpNewValueNum(exp).first; valueNumbering[V] = e; return e; } /// Returns the value number of the specified value. Fails if /// the value has not yet been numbered. -uint32_t GVN::ValueTable::lookup(Value *V) const { +uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const { DenseMap::const_iterator VI = valueNumbering.find(V); - assert(VI != valueNumbering.end() && "Value not numbered?"); - return VI->second; + if (Verify) { + assert(VI != valueNumbering.end() && "Value not numbered?"); + return VI->second; + } + return (VI != valueNumbering.end()) ? 
VI->second : 0; } /// Returns the value number of the given comparison, @@ -549,21 +559,29 @@ uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode, CmpInst::Predicate Predicate, Value *LHS, Value *RHS) { Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS); - uint32_t& e = expressionNumbering[exp]; - if (!e) e = nextValueNumber++; - return e; + return assignExpNewValueNum(exp).first; } /// Remove all entries from the ValueTable. void GVN::ValueTable::clear() { valueNumbering.clear(); expressionNumbering.clear(); + NumberingPhi.clear(); + PhiTranslateTable.clear(); + BlockRPONumber.clear(); nextValueNumber = 1; + Expressions.clear(); + ExprIdx.clear(); + nextExprNumber = 0; } /// Remove a value from the value numbering. void GVN::ValueTable::erase(Value *V) { + uint32_t Num = valueNumbering.lookup(V); valueNumbering.erase(V); + // If V is PHINode, V <--> value number is an one-to-one mapping. + if (isa(V)) + NumberingPhi.erase(Num); } /// verifyRemoved - Verify that the value is removed from all internal data @@ -1451,6 +1469,104 @@ bool GVN::processLoad(LoadInst *L) { return false; } +/// Return a pair the first field showing the value number of \p Exp and the +/// second field showing whether it is a value number newly created. +std::pair +GVN::ValueTable::assignExpNewValueNum(Expression &Exp) { + uint32_t &e = expressionNumbering[Exp]; + bool CreateNewValNum = !e; + if (CreateNewValNum) { + Expressions.push_back(Exp); + if (ExprIdx.size() < nextValueNumber + 1) + ExprIdx.resize(nextValueNumber * 2); + e = nextValueNumber; + ExprIdx[nextValueNumber++] = nextExprNumber++; + } + return {e, CreateNewValNum}; +} + +void GVN::ValueTable::assignBlockRPONumber(Function &F) { + uint32_t NextBlockNumber = 1; + ReversePostOrderTraversal RPOT(&F); + for (BasicBlock *BB : RPOT) + BlockRPONumber[BB] = NextBlockNumber++; +} + +/// Return whether all the values related with the same \p num are +/// defined in \p BB. +bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB, + GVN &Gvn) { + LeaderTableEntry *Vals = &Gvn.LeaderTable[Num]; + while (Vals && Vals->BB == BB) + Vals = Vals->Next; + return !Vals; +} + +/// Wrap phiTranslateImpl to provide caching functionality. +uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred, + const BasicBlock *PhiBlock, uint32_t Num, + GVN &Gvn) { + auto FindRes = PhiTranslateTable.find({Num, Pred}); + if (FindRes != PhiTranslateTable.end()) + return FindRes->second; + uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn); + PhiTranslateTable.insert({{Num, Pred}, NewNum}); + return NewNum; +} + +/// Translate value number \p Num using phis, so that it has the values of +/// the phis in BB. +uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred, + const BasicBlock *PhiBlock, + uint32_t Num, GVN &Gvn) { + if (PHINode *PN = NumberingPhi[Num]) { + if (BlockRPONumber[Pred] >= BlockRPONumber[PhiBlock]) + return Num; + for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) { + if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred) + if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false)) + return TransVal; + } + return Num; + } + + // If there is any value related with Num is defined in a BB other than + // PhiBlock, it cannot depend on a phi in PhiBlock without going through + // a backedge. We can do an early exit in that case to save compile time. 
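A standalone sketch of two pieces of the GVN ValueTable changes above, using toy types and std::map rather than GVN::Expression and DenseMap: commutative operands are kept in sorted order (the property the new `commutative` flag records so phiTranslateImpl can restore it after rewriting operands), and assignExpNewValueNum hands out a number for an expression while reporting whether the number was newly created.

#include <algorithm>
#include <cstdint>
#include <map>
#include <tuple>
#include <utility>
#include <vector>

struct ToyExpression {
  uint32_t Opcode = 0;
  bool Commutative = false;
  std::vector<uint32_t> VarArgs; // operand value numbers

  // Keep commutative operands sorted so `a + b` and `b + a` compare equal.
  void canonicalizeCommutative() {
    Commutative = true;
    if (VarArgs.size() == 2 && VarArgs[0] > VarArgs[1])
      std::swap(VarArgs[0], VarArgs[1]);
  }

  bool operator<(const ToyExpression &O) const {
    return std::tie(Opcode, Commutative, VarArgs) <
           std::tie(O.Opcode, O.Commutative, O.VarArgs);
  }
};

struct ToyValueTable {
  std::map<ToyExpression, uint32_t> ExpressionNumbering;
  uint32_t NextValueNumber = 1;

  // Mirrors assignExpNewValueNum: {value number, was a new number created?}.
  std::pair<uint32_t, bool> assignExpNewValueNum(const ToyExpression &Exp) {
    uint32_t &Num = ExpressionNumbering[Exp];
    bool CreatedNew = (Num == 0);
    if (CreatedNew)
      Num = NextValueNumber++;
    return {Num, CreatedNew};
  }
};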
+ if (!areAllValsInBB(Num, PhiBlock, Gvn)) + return Num; + + if (ExprIdx[Num] == 0 || Num >= ExprIdx.size()) + return Num; + Expression Exp = Expressions[ExprIdx[Num]]; + + for (unsigned i = 0; i < Exp.varargs.size(); i++) { + // For InsertValue and ExtractValue, some varargs are index numbers + // instead of value numbers. Those index numbers should not be + // translated. + if ((i > 1 && Exp.opcode == Instruction::InsertValue) || + (i > 0 && Exp.opcode == Instruction::ExtractValue)) + continue; + Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn); + } + + if (Exp.commutative) { + assert(Exp.varargs.size() == 2 && "Unsupported commutative expression!"); + if (Exp.varargs[0] > Exp.varargs[1]) { + std::swap(Exp.varargs[0], Exp.varargs[1]); + uint32_t Opcode = Exp.opcode >> 8; + if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) + Exp.opcode = (Opcode << 8) | + CmpInst::getSwappedPredicate( + static_cast(Exp.opcode & 255)); + } + } + + if (uint32_t NewNum = expressionNumbering[Exp]) + return NewNum; + return Num; +} + // In order to find a leader for a given value number at a // specific basic block, we first obtain the list of all Values for that number, // and then scan the list to find one whose block dominates the block in @@ -1856,6 +1972,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT, // Fabricate val-num for dead-code in order to suppress assertion in // performPRE(). assignValNumForDeadCode(); + VN.assignBlockRPONumber(F); bool PREChanged = true; while (PREChanged) { PREChanged = performPRE(F); @@ -1945,7 +2062,9 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, success = false; break; } - if (Value *V = findLeader(Pred, VN.lookup(Op))) { + uint32_t TValNo = + VN.phiTranslate(Pred, Instr->getParent(), VN.lookup(Op), *this); + if (Value *V = findLeader(Pred, TValNo)) { Instr->setOperand(i, V); } else { success = false; @@ -1962,10 +2081,12 @@ bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred, Instr->insertBefore(Pred->getTerminator()); Instr->setName(Instr->getName() + ".pre"); Instr->setDebugLoc(Instr->getDebugLoc()); - VN.add(Instr, ValNo); + + unsigned Num = VN.lookupOrAdd(Instr); + VN.add(Instr, Num); // Update the availability map to include the new instruction. - addToLeaderTable(ValNo, Instr, Pred); + addToLeaderTable(Num, Instr, Pred); return true; } @@ -2014,7 +2135,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } - Value *predV = findLeader(P, ValNo); + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); + Value *predV = findLeader(P, TValNo); if (!predV) { predMap.push_back(std::make_pair(static_cast(nullptr), P)); PREPred = P; diff --git a/lib/Transforms/Scalar/GVNSink.cpp b/lib/Transforms/Scalar/GVNSink.cpp new file mode 100644 index 000000000000..5c75f39e381d --- /dev/null +++ b/lib/Transforms/Scalar/GVNSink.cpp @@ -0,0 +1,872 @@ +//===- GVNSink.cpp - sink expressions into successors -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file GVNSink.cpp +/// This pass attempts to sink instructions into successors, reducing static +/// instruction count and enabling if-conversion. +/// +/// We use a variant of global value numbering to decide what can be sunk. 
+/// Consider: +/// +/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ] +/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ] +/// \ / +/// [ %e = phi i32 %a2, %c2 ] +/// [ add i32 %e, 4 ] +/// +/// +/// GVN would number %a1 and %c1 differently because they compute different +/// results - the VN of an instruction is a function of its opcode and the +/// transitive closure of its operands. This is the key property for hoisting +/// and CSE. +/// +/// What we want when sinking however is for a numbering that is a function of +/// the *uses* of an instruction, which allows us to answer the question "if I +/// replace %a1 with %c1, will it contribute in an equivalent way to all +/// successive instructions?". The PostValueTable class in GVN provides this +/// mapping. +/// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/GlobalsModRef.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/GVN.h" +#include "llvm/Transforms/Scalar/GVNExpression.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include +using namespace llvm; + +#define DEBUG_TYPE "gvn-sink" + +STATISTIC(NumRemoved, "Number of instructions removed"); + +namespace { + +static bool isMemoryInst(const Instruction *I) { + return isa(I) || isa(I) || + (isa(I) && !cast(I)->doesNotAccessMemory()) || + (isa(I) && !cast(I)->doesNotAccessMemory()); +} + +/// Iterates through instructions in a set of blocks in reverse order from the +/// first non-terminator. For example (assume all blocks have size n): +/// LockstepReverseIterator I([B1, B2, B3]); +/// *I-- = [B1[n], B2[n], B3[n]]; +/// *I-- = [B1[n-1], B2[n-1], B3[n-1]]; +/// *I-- = [B1[n-2], B2[n-2], B3[n-2]]; +/// ... +/// +/// It continues until all blocks have been exhausted. Use \c getActiveBlocks() +/// to +/// determine which blocks are still going and the order they appear in the +/// list returned by operator*. +class LockstepReverseIterator { + ArrayRef Blocks; + SmallPtrSet ActiveBlocks; + SmallVector Insts; + bool Fail; + +public: + LockstepReverseIterator(ArrayRef Blocks) : Blocks(Blocks) { + reset(); + } + + void reset() { + Fail = false; + ActiveBlocks.clear(); + for (BasicBlock *BB : Blocks) + ActiveBlocks.insert(BB); + Insts.clear(); + for (BasicBlock *BB : Blocks) { + if (BB->size() <= 1) { + // Block wasn't big enough - only contained a terminator. 
+ ActiveBlocks.erase(BB); + continue; + } + Insts.push_back(BB->getTerminator()->getPrevNode()); + } + if (Insts.empty()) + Fail = true; + } + + bool isValid() const { return !Fail; } + ArrayRef operator*() const { return Insts; } + SmallPtrSet &getActiveBlocks() { return ActiveBlocks; } + + void restrictToBlocks(SmallPtrSetImpl &Blocks) { + for (auto II = Insts.begin(); II != Insts.end();) { + if (std::find(Blocks.begin(), Blocks.end(), (*II)->getParent()) == + Blocks.end()) { + ActiveBlocks.erase((*II)->getParent()); + II = Insts.erase(II); + } else { + ++II; + } + } + } + + void operator--() { + if (Fail) + return; + SmallVector NewInsts; + for (auto *Inst : Insts) { + if (Inst == &Inst->getParent()->front()) + ActiveBlocks.erase(Inst->getParent()); + else + NewInsts.push_back(Inst->getPrevNode()); + } + if (NewInsts.empty()) { + Fail = true; + return; + } + Insts = NewInsts; + } +}; + +//===----------------------------------------------------------------------===// + +/// Candidate solution for sinking. There may be different ways to +/// sink instructions, differing in the number of instructions sunk, +/// the number of predecessors sunk from and the number of PHIs +/// required. +struct SinkingInstructionCandidate { + unsigned NumBlocks; + unsigned NumInstructions; + unsigned NumPHIs; + unsigned NumMemoryInsts; + int Cost = -1; + SmallVector Blocks; + + void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) { + unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs; + unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0; + Cost = (NumInstructions * (NumBlocks - 1)) - + (NumExtraPHIs * + NumExtraPHIs) // PHIs are expensive, so make sure they're worth it. + - SplitEdgeCost; + } + bool operator>=(const SinkingInstructionCandidate &Other) const { + return Cost >= Other.Cost; + } +}; + +#ifndef NDEBUG +llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const SinkingInstructionCandidate &C) { + OS << ""; + return OS; +} +#endif + +//===----------------------------------------------------------------------===// + +/// Describes a PHI node that may or may not exist. These track the PHIs +/// that must be created if we sunk a sequence of instructions. It provides +/// a hash function for efficient equality comparisons. +class ModelledPHI { + SmallVector Values; + SmallVector Blocks; + +public: + ModelledPHI() {} + ModelledPHI(const PHINode *PN) { + for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) + Blocks.push_back(PN->getIncomingBlock(I)); + std::sort(Blocks.begin(), Blocks.end()); + + // This assumes the PHI is already well-formed and there aren't conflicting + // incoming values for the same block. + for (auto *B : Blocks) + Values.push_back(PN->getIncomingValueForBlock(B)); + } + /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI + /// without the same ID. + /// \note This is specifically for DenseMapInfo - do not use this! + static ModelledPHI createDummy(size_t ID) { + ModelledPHI M; + M.Values.push_back(reinterpret_cast(ID)); + return M; + } + + /// Create a PHI from an array of incoming values and incoming blocks. + template + ModelledPHI(const VArray &V, const BArray &B) { + std::copy(V.begin(), V.end(), std::back_inserter(Values)); + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + } + + /// Create a PHI from [I[OpNum] for I in Insts]. 
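The cost model from SinkingInstructionCandidate::calculateCost above, restated as a free function with a small worked example; the formula and weights come from the code in this patch, while the function name and the sample numbers are only for illustration.

#include <cassert>

static int sinkingCost(unsigned NumInstructions, unsigned NumBlocks,
                       unsigned NumPHIs, unsigned NumOrigPHIs,
                       unsigned NumOrigBlocks) {
  unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
  // Splitting predecessor edges is only needed when not all original
  // predecessors participate in the sink.
  unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
  return static_cast<int>(NumInstructions * (NumBlocks - 1)) -
         static_cast<int>(NumExtraPHIs * NumExtraPHIs) -
         static_cast<int>(SplitEdgeCost);
}

int main() {
  // Sinking 3 instructions from 2 predecessors while introducing 1 extra PHI
  // and no edge split: 3 * (2 - 1) - 1 * 1 - 0 = 2, so the candidate is kept.
  assert(sinkingCost(3, 2, 1, 0, 2) == 2);
  return 0;
}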
+ template + ModelledPHI(ArrayRef Insts, unsigned OpNum, const BArray &B) { + std::copy(B.begin(), B.end(), std::back_inserter(Blocks)); + for (auto *I : Insts) + Values.push_back(I->getOperand(OpNum)); + } + + /// Restrict the PHI's contents down to only \c NewBlocks. + /// \c NewBlocks must be a subset of \c this->Blocks. + void restrictToBlocks(const SmallPtrSetImpl &NewBlocks) { + auto BI = Blocks.begin(); + auto VI = Values.begin(); + while (BI != Blocks.end()) { + assert(VI != Values.end()); + if (std::find(NewBlocks.begin(), NewBlocks.end(), *BI) == + NewBlocks.end()) { + BI = Blocks.erase(BI); + VI = Values.erase(VI); + } else { + ++BI; + ++VI; + } + } + assert(Blocks.size() == NewBlocks.size()); + } + + ArrayRef getValues() const { return Values; } + + bool areAllIncomingValuesSame() const { + return all_of(Values, [&](Value *V) { return V == Values[0]; }); + } + bool areAllIncomingValuesSameType() const { + return all_of( + Values, [&](Value *V) { return V->getType() == Values[0]->getType(); }); + } + bool areAnyIncomingValuesConstant() const { + return any_of(Values, [&](Value *V) { return isa(V); }); + } + // Hash functor + unsigned hash() const { + return (unsigned)hash_combine_range(Values.begin(), Values.end()); + } + bool operator==(const ModelledPHI &Other) const { + return Values == Other.Values && Blocks == Other.Blocks; + } +}; + +template struct DenseMapInfo { + static inline ModelledPHI &getEmptyKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(0); + return Dummy; + } + static inline ModelledPHI &getTombstoneKey() { + static ModelledPHI Dummy = ModelledPHI::createDummy(1); + return Dummy; + } + static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); } + static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) { + return LHS == RHS; + } +}; + +typedef DenseSet> ModelledPHISet; + +//===----------------------------------------------------------------------===// +// ValueTable +//===----------------------------------------------------------------------===// +// This is a value number table where the value number is a function of the +// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know +// that the program would be equivalent if we replaced A with PHI(A, B). +//===----------------------------------------------------------------------===// + +/// A GVN expression describing how an instruction is used. The operands +/// field of BasicExpression is used to store uses, not operands. +/// +/// This class also contains fields for discriminators used when determining +/// equivalence of instructions with sideeffects. 
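A toy rendering of the use-based numbering idea described above: the hash of an instruction is built from its opcode, type, memory ordering discriminators, and the value numbers of its *users*, so two instructions that feed equivalent consumers collide even if their operands differ. The mixing constant is the common boost-style combine, not llvm::hash_combine, and all names here are hypothetical.

#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

static size_t hashByUses(unsigned Opcode, unsigned TypeID,
                         unsigned MemoryUseOrder, bool Volatile,
                         const std::vector<uint32_t> &UserValueNumbers) {
  size_t H = std::hash<unsigned>()(Opcode);
  auto Mix = [&H](size_t V) {
    H ^= V + 0x9e3779b97f4a7c15ULL + (H << 6) + (H >> 2);
  };
  Mix(std::hash<unsigned>()(TypeID));
  Mix(std::hash<unsigned>()(MemoryUseOrder));
  Mix(std::hash<bool>()(Volatile));
  // Users, not operands: this is what distinguishes the sinking value table
  // from the hoisting/CSE one.
  for (uint32_t VN : UserValueNumbers)
    Mix(std::hash<uint32_t>()(VN));
  return H;
}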
+class InstructionUseExpr : public GVNExpression::BasicExpression { + unsigned MemoryUseOrder = -1; + bool Volatile = false; + +public: + InstructionUseExpr(Instruction *I, ArrayRecycler &R, + BumpPtrAllocator &A) + : GVNExpression::BasicExpression(I->getNumUses()) { + allocateOperands(R, A); + setOpcode(I->getOpcode()); + setType(I->getType()); + + for (auto &U : I->uses()) + op_push_back(U.getUser()); + std::sort(op_begin(), op_end()); + } + void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; } + void setVolatile(bool V) { Volatile = V; } + + virtual hash_code getHashValue() const { + return hash_combine(GVNExpression::BasicExpression::getHashValue(), + MemoryUseOrder, Volatile); + } + + template hash_code getHashValue(Function MapFn) { + hash_code H = + hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile); + for (auto *V : operands()) + H = hash_combine(H, MapFn(V)); + return H; + } +}; + +class ValueTable { + DenseMap ValueNumbering; + DenseMap ExpressionNumbering; + DenseMap HashNumbering; + BumpPtrAllocator Allocator; + ArrayRecycler Recycler; + uint32_t nextValueNumber; + + /// Create an expression for I based on its opcode and its uses. If I + /// touches or reads memory, the expression is also based upon its memory + /// order - see \c getMemoryUseOrder(). + InstructionUseExpr *createExpr(Instruction *I) { + InstructionUseExpr *E = + new (Allocator) InstructionUseExpr(I, Recycler, Allocator); + if (isMemoryInst(I)) + E->setMemoryUseOrder(getMemoryUseOrder(I)); + + if (CmpInst *C = dyn_cast(I)) { + CmpInst::Predicate Predicate = C->getPredicate(); + E->setOpcode((C->getOpcode() << 8) | Predicate); + } + return E; + } + + /// Helper to compute the value number for a memory instruction + /// (LoadInst/StoreInst), including checking the memory ordering and + /// volatility. + template InstructionUseExpr *createMemoryExpr(Inst *I) { + if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic()) + return nullptr; + InstructionUseExpr *E = createExpr(I); + E->setVolatile(I->isVolatile()); + return E; + } + +public: + /// Returns the value number for the specified value, assigning + /// it a new number if it did not have one before. 
+ uint32_t lookupOrAdd(Value *V) { + auto VI = ValueNumbering.find(V); + if (VI != ValueNumbering.end()) + return VI->second; + + if (!isa(V)) { + ValueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + Instruction *I = cast(V); + InstructionUseExpr *exp = nullptr; + switch (I->getOpcode()) { + case Instruction::Load: + exp = createMemoryExpr(cast(I)); + break; + case Instruction::Store: + exp = createMemoryExpr(cast(I)); + break; + case Instruction::Call: + case Instruction::Invoke: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::ICmp: + case Instruction::FCmp: + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPTrunc: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::BitCast: + case Instruction::Select: + case Instruction::ExtractElement: + case Instruction::InsertElement: + case Instruction::ShuffleVector: + case Instruction::InsertValue: + case Instruction::GetElementPtr: + exp = createExpr(I); + break; + default: + break; + } + + if (!exp) { + ValueNumbering[V] = nextValueNumber; + return nextValueNumber++; + } + + uint32_t e = ExpressionNumbering[exp]; + if (!e) { + hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); }); + auto I = HashNumbering.find(H); + if (I != HashNumbering.end()) { + e = I->second; + } else { + e = nextValueNumber++; + HashNumbering[H] = e; + ExpressionNumbering[exp] = e; + } + } + ValueNumbering[V] = e; + return e; + } + + /// Returns the value number of the specified value. Fails if the value has + /// not yet been numbered. + uint32_t lookup(Value *V) const { + auto VI = ValueNumbering.find(V); + assert(VI != ValueNumbering.end() && "Value not numbered?"); + return VI->second; + } + + /// Removes all value numberings and resets the value table. + void clear() { + ValueNumbering.clear(); + ExpressionNumbering.clear(); + HashNumbering.clear(); + Recycler.clear(Allocator); + nextValueNumber = 1; + } + + ValueTable() : nextValueNumber(1) {} + + /// \c Inst uses or touches memory. Return an ID describing the memory state + /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2), + /// the exact same memory operations happen after I1 and I2. + /// + /// This is a very hard problem in general, so we use domain-specific + /// knowledge that we only ever check for equivalence between blocks sharing a + /// single immediate successor that is common, and when determining if I1 == + /// I2 we will have already determined that next(I1) == next(I2). This + /// inductive property allows us to simply return the value number of the next + /// instruction that defines memory. 
+ uint32_t getMemoryUseOrder(Instruction *Inst) { + auto *BB = Inst->getParent(); + for (auto I = std::next(Inst->getIterator()), E = BB->end(); + I != E && !I->isTerminator(); ++I) { + if (!isMemoryInst(&*I)) + continue; + if (isa(&*I)) + continue; + CallInst *CI = dyn_cast(&*I); + if (CI && CI->onlyReadsMemory()) + continue; + InvokeInst *II = dyn_cast(&*I); + if (II && II->onlyReadsMemory()) + continue; + return lookupOrAdd(&*I); + } + return 0; + } +}; + +//===----------------------------------------------------------------------===// + +class GVNSink { +public: + GVNSink() : VN() {} + bool run(Function &F) { + DEBUG(dbgs() << "GVNSink: running on function @" << F.getName() << "\n"); + + unsigned NumSunk = 0; + ReversePostOrderTraversal RPOT(&F); + for (auto *N : RPOT) + NumSunk += sinkBB(N); + + return NumSunk > 0; + } + +private: + ValueTable VN; + + bool isInstructionBlacklisted(Instruction *I) { + // These instructions may change or break semantics if moved. + if (isa(I) || I->isEHPad() || isa(I) || + I->getType()->isTokenTy()) + return true; + return false; + } + + /// The main heuristic function. Analyze the set of instructions pointed to by + /// LRI and return a candidate solution if these instructions can be sunk, or + /// None otherwise. + Optional analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl &PHIContents); + + /// Create a ModelledPHI for each PHI in BB, adding to PHIs. + void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs, + SmallPtrSetImpl &PHIContents) { + for (auto &I : *BB) { + auto *PN = dyn_cast(&I); + if (!PN) + return; + + auto MPHI = ModelledPHI(PN); + PHIs.insert(MPHI); + for (auto *V : MPHI.getValues()) + PHIContents.insert(V); + } + } + + /// The main instruction sinking driver. Set up state and try and sink + /// instructions into BBEnd from its predecessors. + unsigned sinkBB(BasicBlock *BBEnd); + + /// Perform the actual mechanics of sinking an instruction from Blocks into + /// BBEnd, which is their only successor. + void sinkLastInstruction(ArrayRef Blocks, BasicBlock *BBEnd); + + /// Remove PHIs that all have the same incoming value. + void foldPointlessPHINodes(BasicBlock *BB) { + auto I = BB->begin(); + while (PHINode *PN = dyn_cast(I++)) { + if (!all_of(PN->incoming_values(), + [&](const Value *V) { return V == PN->getIncomingValue(0); })) + continue; + if (PN->getIncomingValue(0) != PN) + PN->replaceAllUsesWith(PN->getIncomingValue(0)); + else + PN->replaceAllUsesWith(UndefValue::get(PN->getType())); + PN->eraseFromParent(); + } + } +}; + +Optional GVNSink::analyzeInstructionForSinking( + LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum, + ModelledPHISet &NeededPHIs, SmallPtrSetImpl &PHIContents) { + auto Insts = *LRI; + DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I + : Insts) { + I->dump(); + } dbgs() << " ]\n";); + + DenseMap VNums; + for (auto *I : Insts) { + uint32_t N = VN.lookupOrAdd(I); + DEBUG(dbgs() << " VN=" << utohexstr(N) << " for" << *I << "\n"); + if (N == ~0U) + return None; + VNums[N]++; + } + unsigned VNumToSink = + std::max_element(VNums.begin(), VNums.end(), + [](const std::pair &I, + const std::pair &J) { + return I.second < J.second; + }) + ->first; + + if (VNums[VNumToSink] == 1) + // Can't sink anything! + return None; + + // Now restrict the number of incoming blocks down to only those with + // VNumToSink. 
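The VNums/max_element step of analyzeInstructionForSinking above, mirrored with standard containers and hypothetical names: count how many candidate instructions share each value number and pick the most frequent one; sinking is only worthwhile when at least two instructions agree.

#include <algorithm>
#include <cstdint>
#include <unordered_map>
#include <vector>

static uint32_t pickValueNumberToSink(const std::vector<uint32_t> &VNs,
                                      bool &Profitable) {
  std::unordered_map<uint32_t, unsigned> Counts;
  for (uint32_t VN : VNs)
    ++Counts[VN];
  auto Best = std::max_element(
      Counts.begin(), Counts.end(),
      [](const std::pair<const uint32_t, unsigned> &A,
         const std::pair<const uint32_t, unsigned> &B) {
        return A.second < B.second;
      });
  Profitable = Best != Counts.end() && Best->second > 1;
  return Profitable ? Best->first : 0;
}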
+ auto &ActivePreds = LRI.getActiveBlocks(); + unsigned InitialActivePredSize = ActivePreds.size(); + SmallVector NewInsts; + for (auto *I : Insts) { + if (VN.lookup(I) != VNumToSink) + ActivePreds.erase(I->getParent()); + else + NewInsts.push_back(I); + } + for (auto *I : NewInsts) + if (isInstructionBlacklisted(I)) + return None; + + // If we've restricted the incoming blocks, restrict all needed PHIs also + // to that set. + bool RecomputePHIContents = false; + if (ActivePreds.size() != InitialActivePredSize) { + ModelledPHISet NewNeededPHIs; + for (auto P : NeededPHIs) { + P.restrictToBlocks(ActivePreds); + NewNeededPHIs.insert(P); + } + NeededPHIs = NewNeededPHIs; + LRI.restrictToBlocks(ActivePreds); + RecomputePHIContents = true; + } + + // The sunk instruction's results. + ModelledPHI NewPHI(NewInsts, ActivePreds); + + // Does sinking this instruction render previous PHIs redundant? + if (NeededPHIs.find(NewPHI) != NeededPHIs.end()) { + NeededPHIs.erase(NewPHI); + RecomputePHIContents = true; + } + + if (RecomputePHIContents) { + // The needed PHIs have changed, so recompute the set of all needed + // values. + PHIContents.clear(); + for (auto &PHI : NeededPHIs) + PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end()); + } + + // Is this instruction required by a later PHI that doesn't match this PHI? + // if so, we can't sink this instruction. + for (auto *V : NewPHI.getValues()) + if (PHIContents.count(V)) + // V exists in this PHI, but the whole PHI is different to NewPHI + // (else it would have been removed earlier). We cannot continue + // because this isn't representable. + return None; + + // Which operands need PHIs? + // FIXME: If any of these fail, we should partition up the candidates to + // try and continue making progress. + Instruction *I0 = NewInsts[0]; + for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) { + ModelledPHI PHI(NewInsts, OpNum, ActivePreds); + if (PHI.areAllIncomingValuesSame()) + continue; + if (!canReplaceOperandWithVariable(I0, OpNum)) + // We can 't create a PHI from this instruction! + return None; + if (NeededPHIs.count(PHI)) + continue; + if (!PHI.areAllIncomingValuesSameType()) + return None; + // Don't create indirect calls! The called value is the final operand. + if ((isa(I0) || isa(I0)) && OpNum == E - 1 && + PHI.areAnyIncomingValuesConstant()) + return None; + + NeededPHIs.reserve(NeededPHIs.size()); + NeededPHIs.insert(PHI); + PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end()); + } + + if (isMemoryInst(NewInsts[0])) + ++MemoryInstNum; + + SinkingInstructionCandidate Cand; + Cand.NumInstructions = ++InstNum; + Cand.NumMemoryInsts = MemoryInstNum; + Cand.NumBlocks = ActivePreds.size(); + Cand.NumPHIs = NeededPHIs.size(); + for (auto *C : ActivePreds) + Cand.Blocks.push_back(C); + + return Cand; +} + +unsigned GVNSink::sinkBB(BasicBlock *BBEnd) { + DEBUG(dbgs() << "GVNSink: running on basic block "; + BBEnd->printAsOperand(dbgs()); dbgs() << "\n"); + SmallVector Preds; + for (auto *B : predecessors(BBEnd)) { + auto *T = B->getTerminator(); + if (isa(T) || isa(T)) + Preds.push_back(B); + else + return 0; + } + if (Preds.size() < 2) + return 0; + std::sort(Preds.begin(), Preds.end()); + + unsigned NumOrigPreds = Preds.size(); + // We can only sink instructions through unconditional branches. 
+ for (auto I = Preds.begin(); I != Preds.end();) { + if ((*I)->getTerminator()->getNumSuccessors() != 1) + I = Preds.erase(I); + else + ++I; + } + + LockstepReverseIterator LRI(Preds); + SmallVector Candidates; + unsigned InstNum = 0, MemoryInstNum = 0; + ModelledPHISet NeededPHIs; + SmallPtrSet PHIContents; + analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents); + unsigned NumOrigPHIs = NeededPHIs.size(); + + while (LRI.isValid()) { + auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum, + NeededPHIs, PHIContents); + if (!Cand) + break; + Cand->calculateCost(NumOrigPHIs, Preds.size()); + Candidates.emplace_back(*Cand); + --LRI; + } + + std::stable_sort( + Candidates.begin(), Candidates.end(), + [](const SinkingInstructionCandidate &A, + const SinkingInstructionCandidate &B) { return A >= B; }); + DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C + : Candidates) dbgs() + << " " << C << "\n";); + + // Pick the top candidate, as long it is positive! + if (Candidates.empty() || Candidates.front().Cost <= 0) + return 0; + auto C = Candidates.front(); + + DEBUG(dbgs() << " -- Sinking: " << C << "\n"); + BasicBlock *InsertBB = BBEnd; + if (C.Blocks.size() < NumOrigPreds) { + DEBUG(dbgs() << " -- Splitting edge to "; BBEnd->printAsOperand(dbgs()); + dbgs() << "\n"); + InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split"); + if (!InsertBB) { + DEBUG(dbgs() << " -- FAILED to split edge!\n"); + // Edge couldn't be split. + return 0; + } + } + + for (unsigned I = 0; I < C.NumInstructions; ++I) + sinkLastInstruction(C.Blocks, InsertBB); + + return C.NumInstructions; +} + +void GVNSink::sinkLastInstruction(ArrayRef Blocks, + BasicBlock *BBEnd) { + SmallVector Insts; + for (BasicBlock *BB : Blocks) + Insts.push_back(BB->getTerminator()->getPrevNode()); + Instruction *I0 = Insts.front(); + + SmallVector NewOperands; + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) { + bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) { + return I->getOperand(O) != I0->getOperand(O); + }); + if (!NeedPHI) { + NewOperands.push_back(I0->getOperand(O)); + continue; + } + + // Create a new PHI in the successor block and populate it. + auto *Op = I0->getOperand(O); + assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!"); + auto *PN = PHINode::Create(Op->getType(), Insts.size(), + Op->getName() + ".sink", &BBEnd->front()); + for (auto *I : Insts) + PN->addIncoming(I->getOperand(O), I->getParent()); + NewOperands.push_back(PN); + } + + // Arbitrarily use I0 as the new "common" instruction; remap its operands + // and move it to the start of the successor block. + for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) + I0->getOperandUse(O).set(NewOperands[O]); + I0->moveBefore(&*BBEnd->getFirstInsertionPt()); + + // Update metadata and IR flags. + for (auto *I : Insts) + if (I != I0) { + combineMetadataForCSE(I0, I); + I0->andIRFlags(I); + } + + for (auto *I : Insts) + if (I != I0) + I->replaceAllUsesWith(I0); + foldPointlessPHINodes(BBEnd); + + // Finally nuke all instructions apart from the common instruction. 
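The per-operand decision in sinkLastInstruction above, reduced to a toy predicate over plain operand vectors (one vector per instruction being merged, values standing in for operand identities): an operand position needs a PHI exactly when the blocks being merged disagree on that operand.

#include <vector>

static bool operandNeedsPHI(const std::vector<std::vector<int>> &InstOperands,
                            unsigned OpIdx) {
  for (const auto &Ops : InstOperands)
    if (Ops[OpIdx] != InstOperands.front()[OpIdx])
      return true;
  return false;
}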
+ for (auto *I : Insts) + if (I != I0) + I->eraseFromParent(); + + NumRemoved += Insts.size() - 1; +} + +//////////////////////////////////////////////////////////////////////////////// +// Pass machinery / boilerplate + +class GVNSinkLegacyPass : public FunctionPass { +public: + static char ID; + + GVNSinkLegacyPass() : FunctionPass(ID) { + initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + GVNSink G; + return G.run(F); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addPreserved(); + } +}; +} // namespace + +PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) { + GVNSink G; + if (!G.run(F)) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserve(); + return PA; +} + +char GVNSinkLegacyPass::ID = 0; +INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass) +INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink", + "Early GVN sinking of Expressions", false, false) + +FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); } diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp index 198d2b2b024f..65a2cd955672 100644 --- a/lib/Transforms/Scalar/GuardWidening.cpp +++ b/lib/Transforms/Scalar/GuardWidening.cpp @@ -537,9 +537,7 @@ bool GuardWideningImpl::parseRangeChecks( Changed = true; } else if (match(Check.getBase(), m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) { - unsigned BitWidth = OpLHS->getType()->getScalarSizeInBits(); - KnownBits Known(BitWidth); - computeKnownBits(OpLHS, Known, DL); + KnownBits Known = computeKnownBits(OpLHS, DL); if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) { Check.setBase(OpLHS); APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue(); diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp index 85db6e5e1105..e21b0feb7c5a 100644 --- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp +++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp @@ -1228,7 +1228,12 @@ void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef BBs) { Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent, ValueToValueMapTy &VM) { - Loop &New = LPM.addLoop(Parent); + Loop &New = *new Loop(); + if (Parent) + Parent->addChildLoop(&New); + else + LI.addTopLevelLoop(&New); + LPM.addLoop(New); // Add all of the blocks in Original to the new loop. for (auto *BB : Original->blocks()) diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp index ada22ae38eb8..2ef8f8563bb9 100644 --- a/lib/Transforms/Scalar/JumpThreading.cpp +++ b/lib/Transforms/Scalar/JumpThreading.cpp @@ -253,6 +253,35 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_, return EverChanged; } +// Replace uses of Cond with ToVal when safe to do so. If all uses are +// replaced, we can remove Cond. We cannot blindly replace all uses of Cond +// because we may incorrectly replace uses when guards/assumes are uses of +// of `Cond` and we used the guards/assume to reason about the `Cond` value +// at the end of block. RAUW unconditionally replaces all uses +// including the guards/assumes themselves and the uses before the +// guard/assume. 
+static void ReplaceFoldableUses(Instruction *Cond, Value *ToVal) { + assert(Cond->getType() == ToVal->getType()); + auto *BB = Cond->getParent(); + // We can unconditionally replace all uses in non-local blocks (i.e. uses + // strictly dominated by BB), since LVI information is true from the + // terminator of BB. + replaceNonLocalUsesWith(Cond, ToVal); + for (Instruction &I : reverse(*BB)) { + // Reached the Cond whose uses we are trying to replace, so there are no + // more uses. + if (&I == Cond) + break; + // We only replace uses in instructions that are guaranteed to reach the end + // of BB, where we know Cond is ToVal. + if (!isGuaranteedToTransferExecutionToSuccessor(&I)) + break; + I.replaceUsesOfWith(Cond, ToVal); + } + if (Cond->use_empty() && !Cond->mayHaveSideEffects()) + Cond->eraseFromParent(); +} + /// Return the cost of duplicating a piece of this block from first non-phi /// and before StopAt instruction to thread across it. Stop scanning the block /// when exceeding the threshold. If duplication is impossible, returns ~0U. @@ -833,13 +862,19 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { CondBr->eraseFromParent(); if (CondCmp->use_empty()) CondCmp->eraseFromParent(); - // TODO: We can safely replace *some* uses of the CondInst if it has + // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This // is because we use the guards/assume to reason about the `Cond` value // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. + else if (CondCmp->getParent() == BB) { + auto *CI = Ret == LazyValueInfo::True ? + ConstantInt::getTrue(CondCmp->getType()) : + ConstantInt::getFalse(CondCmp->getType()); + ReplaceFoldableUses(CondCmp, CI); + } return true; } @@ -1325,13 +1360,16 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB, if (auto *CondInst = dyn_cast(Cond)) { if (CondInst->use_empty() && !CondInst->mayHaveSideEffects()) CondInst->eraseFromParent(); - // TODO: We can safely replace *some* uses of the CondInst if it has + // We can safely replace *some* uses of the CondInst if it has // exactly one value as returned by LVI. RAUW is incorrect in the // presence of guards and assumes, that have the `Cond` as the use. This // is because we use the guards/assume to reason about the `Cond` value // at the end of block, but RAUW unconditionally replaces all uses // including the guards/assumes themselves and the uses before the // guard/assume. + else if (OnlyVal && OnlyVal != MultipleVal && + CondInst->getParent() == BB) + ReplaceFoldableUses(CondInst, OnlyVal); } return true; } diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index 97337ea5ba62..c6a05ecbd0b1 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -1035,6 +1035,17 @@ static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry) { return nullptr; } +// Check if the recurrence variable `VarX` is in the right form to create +// the idiom. Returns the value coerced to a PHINode if so. 
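A conceptual rendering of ReplaceFoldableUses above on a toy block, with the LLVM instructions replaced by a hypothetical struct: walking from the terminator backwards, uses of Cond may be rewritten only in instructions that are guaranteed to execute all the way to the end of the block; the walk stops at Cond itself or at the first instruction that might not transfer control to its successor (such as a guard or a call that may throw).

#include <vector>

struct ToyInst {
  bool IsCond = false;
  bool GuaranteedToReachBlockEnd = true;
  bool UsesCond = false;
  bool Rewritten = false;
};

static void replaceFoldableUses(std::vector<ToyInst> &Block) {
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (It->IsCond)
      break; // Reached Cond: no earlier uses can exist.
    if (!It->GuaranteedToReachBlockEnd)
      break; // Past this point we can no longer assume Cond's final value.
    if (It->UsesCond)
      It->Rewritten = true;
  }
}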
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX, + BasicBlock *LoopEntry) { + auto *PhiX = dyn_cast(VarX); + if (PhiX && PhiX->getParent() == LoopEntry && + (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX)) + return PhiX; + return nullptr; +} + /// Return true iff the idiom is detected in the loop. /// /// Additionally: @@ -1110,13 +1121,9 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, } // step 3: Check the recurrence of variable X - { - PhiX = dyn_cast(VarX1); - if (!PhiX || - (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) { - return false; - } - } + PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry); + if (!PhiX) + return false; // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1 { @@ -1132,8 +1139,8 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB, if (!Inc || !Inc->isOne()) continue; - PHINode *Phi = dyn_cast(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) continue; // Check if the result of the instruction is live of the loop. @@ -1227,8 +1234,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, VarX = DefX->getOperand(0); // step 3: Check the recurrence of variable X - PhiX = dyn_cast(VarX); - if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX)) + PhiX = getRecurrenceVar(VarX, DefX, LoopEntry); + if (!PhiX) return false; // step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1 @@ -1248,8 +1255,8 @@ static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX, if (!Inc || !Inc->isOne()) continue; - PHINode *Phi = dyn_cast(Inst->getOperand(0)); - if (!Phi || Phi->getParent() != LoopEntry) + PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry); + if (!Phi) continue; CntInst = Inst; diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp index 6ef1464e9338..19daebd0613a 100644 --- a/lib/Transforms/Scalar/LoopUnswitch.cpp +++ b/lib/Transforms/Scalar/LoopUnswitch.cpp @@ -831,7 +831,12 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val, /// mapping the blocks with the specified map. static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM, LoopInfo *LI, LPPassManager *LPM) { - Loop &New = LPM->addLoop(PL); + Loop &New = *new Loop(); + if (PL) + PL->addChildLoop(&New); + else + LI->addTopLevelLoop(&New); + LPM->addLoop(New); // Add all of the blocks in L to the new loop. for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 5cfbf6baeaa9..67abc3116988 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -858,7 +858,14 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, // Filter out unreachable phi operands. auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { - return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}); + if (*U == PN) + return false; + if (!ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock})) + return false; + // Things in TOPClass are equivalent to everything. 
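For reference, the C-level shape of the population-count idiom that detectPopcountIdiom (and the closely related CTLZ detection) above looks for: the recurrence variable is a PHI between its initial value and `x & (x - 1)`, and the counter is a PHI incremented by one per iteration. This is an illustration of the input pattern, not LLVM code.

static int popcountIdiom(unsigned X) {
  int Count = 0;
  while (X) {
    X &= X - 1; // clears the lowest set bit each iteration
    ++Count;
  }
  return Count;
}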
+ if (ValueToClass.lookup(*U) == TOPClass) + return false; + return true; }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), [&](const Use *U) -> Value * { @@ -866,14 +873,6 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, HasBackedge = HasBackedge || isBackedge(BB, PHIBlock); OriginalOpsConstant = OriginalOpsConstant && isa(*U); - // Use nullptr to distinguish between things that were - // originally self-defined and those that have an operand - // leader that is self-defined. - if (*U == PN) - return nullptr; - // Things in TOPClass are equivalent to everything. - if (ValueToClass.lookup(*U) == TOPClass) - return nullptr; return lookupOperandLeader(*U); }); return E; @@ -955,6 +954,10 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, CongruenceClass *CC = ValueToClass.lookup(V); if (CC && CC->getDefiningExpr()) { + // If we simplified to something else, we need to communicate + // that we're users of the value we simplified to. + if (I != V) + addAdditionalUsers(V, I); if (I) DEBUG(dbgs() << "Simplified " << *I << " to " << " expression " << *CC->getDefiningExpr() << "\n"); @@ -1581,6 +1584,30 @@ bool NewGVN::isCycleFree(const Instruction *I) const { // Evaluate PHI nodes symbolically, and create an expression result. const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { + // Resolve irreducible and reducible phi cycles. + // FIXME: This is hopefully a temporary solution while we resolve the issues + // with fixpointing self-cycles. It currently should be "guaranteed" to be + // correct, but non-optimal. The SCCFinder does not, for example, take + // reachability of arguments into account, etc. + SCCFinder.Start(I); + bool CanOptimize = true; + SmallPtrSet OuterOps; + + auto &Component = SCCFinder.getComponentFor(I); + for (auto *Member : Component) { + if (!isa(Member)) { + CanOptimize = false; + break; + } + for (auto &PHIOp : cast(Member)->operands()) + if (!isa(PHIOp) || !Component.count(cast(PHIOp))) + OuterOps.insert(PHIOp); + } + if (CanOptimize && OuterOps.size() == 1) { + DEBUG(dbgs() << "Resolving cyclic phi to value " << *(*OuterOps.begin()) + << "\n"); + return createVariableOrConstant(*OuterOps.begin()); + } // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands @@ -1594,17 +1621,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // See if all arguments are the same. // We track if any were undef because they need special handling. bool HasUndef = false; - bool CycleFree = isCycleFree(I); auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) { - if (Arg == nullptr) - return false; - // Original self-operands are already eliminated during expression creation. - // We can only eliminate value-wise self-operands if it's cycle - // free. Otherwise, eliminating the operand can cause our value to change, - // which can cause us to not eliminate the operand, which changes the value - // back to what it was before, cycling forever. - if (CycleFree && Arg == I) - return false; if (isa(Arg)) { HasUndef = true; return false; @@ -1613,6 +1630,14 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { }); // If we are left with no operands, it's dead. 
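A distilled form of the cyclic-phi resolution added to performSymbolicPHIEvaluation above, assuming the strongly connected component has already been verified to contain only phi nodes (values are plain ints here, purely as a stand-in): if exactly one value enters the component from outside, the whole cycle can be valued as that single value.

#include <set>
#include <utility>
#include <vector>

static bool resolvePhiCycle(const std::set<int> &Component,
                            const std::vector<std::pair<int, int>> &PhiOperands,
                            int &ResolvedValue) {
  // PhiOperands holds (phi, operand) pairs for every phi in the component.
  std::set<int> OuterOps;
  for (const auto &Op : PhiOperands)
    if (!Component.count(Op.second))
      OuterOps.insert(Op.second);
  if (OuterOps.size() != 1)
    return false;
  ResolvedValue = *OuterOps.begin();
  return true;
}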
if (Filtered.begin() == Filtered.end()) { + // If it has undef at this point, it means there are no-non-undef arguments, + // and thus, the value of the phi node must be undef. + if (HasUndef) { + DEBUG(dbgs() << "PHI Node " << *I + << " has no non-undef arguments, valuing it as undef\n"); + return createConstantExpression(UndefValue::get(I->getType())); + } + DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n"); deleteExpression(E); return createDeadExpression(); @@ -1642,7 +1667,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // constants, or all operands are ignored but the undef, it also must be // cycle free. if (!AllConstant && HasBackedge && NumOps > 0 && - !isa(AllSameValue) && !CycleFree) + !isa(AllSameValue) && !isCycleFree(I)) return E; // Only have to check for instructions @@ -3556,6 +3581,7 @@ bool NewGVN::eliminateInstructions(Function &F) { // Map to store the use counts DenseMap UseCounts; for (auto *CC : reverse(CongruenceClasses)) { + DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() << "\n"); // Track the equivalent store info so we can decide whether to try // dead store elimination. SmallVector PossibleDeadStores; @@ -3602,8 +3628,6 @@ bool NewGVN::eliminateInstructions(Function &F) { } CC->swap(MembersLeft); } else { - DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID() - << "\n"); // If this is a singleton, we can skip it. if (CC->size() != 1 || RealToTemp.lookup(Leader)) { // This is a stack because equality replacement/etc may place @@ -3846,6 +3870,7 @@ bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const { return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B); } +namespace { class NewGVNLegacyPass : public FunctionPass { public: static char ID; // Pass identification, replacement for typeid. @@ -3865,6 +3890,7 @@ private: AU.addPreserved(); } }; +} // namespace bool NewGVNLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp index 8908dae2f545..1d0e8396f6a2 100644 --- a/lib/Transforms/Scalar/SCCP.cpp +++ b/lib/Transforms/Scalar/SCCP.cpp @@ -1779,8 +1779,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL, // arguments and return value aggressively, and can assume it is not called // unless we see evidence to the contrary. if (F.hasLocalLinkage()) { - if (AddressIsTaken(&F)) + if (F.hasAddressTaken()) { AddressTakenFunctions.insert(&F); + } else { Solver.AddArgumentTrackedFunction(&F); continue; diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 24bd0a2b7bdf..6e113bccff94 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -326,7 +326,7 @@ private: /// partition. uint64_t BeginOffset, EndOffset; - /// \brief The start end end iterators of this partition. + /// \brief The start and end iterators of this partition. iterator SI, SJ; /// \brief A collection of split slice tails overlapping the partition. 
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 52201d8f3e51..9fa43da99da9 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -48,6 +48,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeEarlyCSELegacyPassPass(Registry); initializeEarlyCSEMemSSALegacyPassPass(Registry); initializeGVNHoistLegacyPassPass(Registry); + initializeGVNSinkLegacyPassPass(Registry); initializeFlattenCFGPassPass(Registry); initializeInductiveRangeCheckEliminationPass(Registry); initializeIndVarSimplifyLegacyPassPass(Registry); diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index b32a61a7e8f8..0f170e26ce5f 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -123,11 +123,62 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, // exit block. DT.changeImmediateDominator(UnswitchedNode, OldPHNode); - // Blocks reachable from the unswitched block may need to change their IDom - // as well. + // For everything that moves up the dominator tree, we need to examine the + // dominator frontier to see if it additionally should move up the dominator + // tree. This lambda appends the dominator frontier for a node on the + // worklist. + // + // Note that we don't currently use the IDFCalculator here for two reasons: + // 1) It computes dominator tree levels for the entire function on each run + // of 'compute'. While this isn't terrible, given that we expect to update + // relatively small subtrees of the domtree, it isn't necessarily the right + // tradeoff. + // 2) The interface doesn't fit this usage well. It doesn't operate in + // append-only, and builds several sets that we don't need. + // + // FIXME: Neither of these issues are a big deal and could be addressed with + // some amount of refactoring of IDFCalculator. That would allow us to share + // the core logic here (which is solving the same core problem). SmallSetVector Worklist; - for (auto *SuccBB : successors(UnswitchedBB)) - Worklist.insert(SuccBB); + SmallVector DomNodes; + SmallPtrSet DomSet; + auto AppendDomFrontier = [&](DomTreeNode *Node) { + assert(DomNodes.empty() && "Must start with no dominator nodes."); + assert(DomSet.empty() && "Must start with an empty dominator set."); + + // First flatten this subtree into sequence of nodes by doing a pre-order + // walk. + DomNodes.push_back(Node); + // We intentionally re-evaluate the size as each node can add new children. + // Because this is a tree walk, this cannot add any duplicates. + for (int i = 0; i < (int)DomNodes.size(); ++i) + DomNodes.insert(DomNodes.end(), DomNodes[i]->begin(), DomNodes[i]->end()); + + // Now create a set of the basic blocks so we can quickly test for + // dominated successors. We could in theory use the DFS numbers of the + // dominator tree for this, but we want this to remain predictably fast + // even while we mutate the dominator tree in ways that would invalidate + // the DFS numbering. + for (DomTreeNode *InnerN : DomNodes) + DomSet.insert(InnerN->getBlock()); + + // Now re-walk the nodes, appending every successor of every node that isn't + // in the set. Note that we don't append the node itself, even though if it + // is a successor it does not strictly dominate itself and thus it would be + // part of the dominance frontier. 
The reason we don't append it is that + // the node passed in came *from* the worklist and so it has already been + // processed. + for (DomTreeNode *InnerN : DomNodes) + for (BasicBlock *SuccBB : successors(InnerN->getBlock())) + if (!DomSet.count(SuccBB)) + Worklist.insert(SuccBB); + + DomNodes.clear(); + DomSet.clear(); + }; + + // Append the initial dom frontier nodes. + AppendDomFrontier(UnswitchedNode); // Walk the worklist. We grow the list in the loop and so must recompute size. for (int i = 0; i < (int)Worklist.size(); ++i) { @@ -136,20 +187,17 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, DomTreeNode *Node = DT[BB]; assert(!DomChain.count(Node) && "Cannot be dominated by a block you can reach!"); - // If this block doesn't have an immediate dominator somewhere in the chain - // we hoisted over, then its position in the domtree hasn't changed. Either - // it is above the region hoisted and still valid, or it is below the - // hoisted block and so was trivially updated. This also applies to - // everything reachable from this block so we're completely done with the - // it. + + // If this block had an immediate dominator somewhere in the chain + // we hoisted over, then its position in the domtree needs to move as it is + // reachable from a node hoisted over this chain. if (!DomChain.count(Node->getIDom())) continue; - // We need to change the IDom for this node but also walk its successors - // which could have similar dominance position. DT.changeImmediateDominator(Node, OldPHNode); - for (auto *SuccBB : successors(BB)) - Worklist.insert(SuccBB); + + // Now add this node's dominator frontier to the worklist as well. + AppendDomFrontier(Node); } } diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index bf2ab7c55be2..1ec3d0d49637 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -133,7 +133,7 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, auto *SP = cast(MD.second); NewMD = DISubprogram::getDistinct( NewFunc->getContext(), SP->getScope(), SP->getName(), - NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(), + SP->getLinkageName(), SP->getFile(), SP->getLine(), SP->getType(), SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp index 73a0b2737e95..57468be9a2a8 100644 --- a/lib/Transforms/Utils/FunctionComparator.cpp +++ b/lib/Transforms/Utils/FunctionComparator.cpp @@ -76,12 +76,14 @@ int FunctionComparator::cmpMem(StringRef L, StringRef R) const { int FunctionComparator::cmpAttrs(const AttributeList L, const AttributeList R) const { - if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots())) + if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets())) return Res; - for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) { - AttributeList::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i), - RE = R.end(i); + for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) { + AttributeSet LAS = L.getAttributes(i); + AttributeSet RAS = R.getAttributes(i); + AttributeSet::iterator LI = LAS.begin(), LE = LAS.end(); + AttributeSet::iterator RI = RAS.begin(), RE = RAS.end(); for (; LI != LE && RI != RE; ++LI, ++RI) { Attribute LA = *LI; Attribute RA = *RI; diff --git 
a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 9cb4762b683c..0ca9f4c484e6 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1397,11 +1397,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, const Optional &CalleeEntryCount, const Instruction *TheCall, - ProfileSummaryInfo *PSI) { + ProfileSummaryInfo *PSI, + BlockFrequencyInfo *CallerBFI) { if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1) return; Optional CallSiteCount = - PSI ? PSI->getProfileCount(TheCall, nullptr) : None; + PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None; uint64_t CallCount = std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0, CalleeEntryCount.getValue()); @@ -1637,7 +1638,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, CalledFunc->front()); updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, - IFI.PSI); + IFI.PSI, IFI.CallerBFI); // Update the profile count of callee. updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 1ca509472b5f..ebd528bc8ec1 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1037,17 +1037,15 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, const DominatorTree *DT) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); - unsigned BitWidth = DL.getPointerTypeSizeInBits(V->getType()); - KnownBits Known(BitWidth); - computeKnownBits(V, Known, DL, 0, AC, CxtI, DT); + KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT); unsigned TrailZ = Known.countMinTrailingZeros(); // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1)); - unsigned Align = 1u << std::min(BitWidth - 1, TrailZ); + unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ); // LLVM doesn't support alignments larger than this currently. Align = std::min(Align, +Value::MaximumAlignment); @@ -1796,6 +1794,23 @@ static unsigned replaceDominatedUsesWith(Value *From, Value *To, return Count; } +unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) { + assert(From->getType() == To->getType()); + auto *BB = From->getParent(); + unsigned Count = 0; + + for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); + UI != UE;) { + Use &U = *UI++; + auto *I = cast(U.getUser()); + if (I->getParent() == BB) + continue; + U.set(To); + ++Count; + } + return Count; +} + unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, const BasicBlockEdge &Root) { @@ -2094,3 +2109,48 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin( !F->doesNotAccessMemory()) CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin); } + +bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) { + // We can't have a PHI with a metadata type. + if (I->getOperand(OpIdx)->getType()->isMetadataTy()) + return false; + + // Early exit. 
+ if (!isa(I->getOperand(OpIdx))) + return true; + + switch (I->getOpcode()) { + default: + return true; + case Instruction::Call: + case Instruction::Invoke: + // Many arithmetic intrinsics have no issue taking a + // variable, however it's hard to distingish these from + // specials such as @llvm.frameaddress that require a constant. + if (isa(I)) + return false; + + // Constant bundle operands may need to retain their constant-ness for + // correctness. + if (ImmutableCallSite(I).isBundleOperand(OpIdx)) + return false; + return true; + case Instruction::ShuffleVector: + // Shufflevector masks are constant. + return OpIdx != 2; + case Instruction::ExtractValue: + case Instruction::InsertValue: + // All operands apart from the first are constant. + return OpIdx == 0; + case Instruction::Alloca: + return false; + case Instruction::GetElementPtr: + if (OpIdx == 0) + return true; + gep_type_iterator It = gep_type_begin(I); + for (auto E = std::next(It, OpIdx); It != E; ++It) + if (It.isStruct()) + return false; + return true; + } +} diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 27f72fcd8bda..1b442a9a264d 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1376,53 +1376,6 @@ HoistTerminator: return true; } -// Is it legal to place a variable in operand \c OpIdx of \c I? -// FIXME: This should be promoted to Instruction. -static bool canReplaceOperandWithVariable(const Instruction *I, - unsigned OpIdx) { - // We can't have a PHI with a metadata type. - if (I->getOperand(OpIdx)->getType()->isMetadataTy()) - return false; - - // Early exit. - if (!isa(I->getOperand(OpIdx))) - return true; - - switch (I->getOpcode()) { - default: - return true; - case Instruction::Call: - case Instruction::Invoke: - // FIXME: many arithmetic intrinsics have no issue taking a - // variable, however it's hard to distingish these from - // specials such as @llvm.frameaddress that require a constant. - if (isa(I)) - return false; - - // Constant bundle operands may need to retain their constant-ness for - // correctness. - if (ImmutableCallSite(I).isBundleOperand(OpIdx)) - return false; - - return true; - - case Instruction::ShuffleVector: - // Shufflevector masks are constant. - return OpIdx != 2; - case Instruction::ExtractValue: - case Instruction::InsertValue: - // All operands apart from the first are constant. - return OpIdx == 0; - case Instruction::Alloca: - return false; - case Instruction::GetElementPtr: - if (OpIdx == 0) - return true; - gep_type_iterator It = std::next(gep_type_begin(I), OpIdx - 1); - return It.isSequential(); - } -} - // All instructions in Insts belong to different blocks that all unconditionally // branch to a common successor. 
Analyze each instruction and return true if it // would be possible to sink them into their successor, creating one common @@ -4368,8 +4321,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC, const DataLayout &DL) { Value *Cond = SI->getCondition(); unsigned Bits = Cond->getType()->getIntegerBitWidth(); - KnownBits Known(Bits); - computeKnownBits(Cond, Known, DL, 0, AC, SI); + KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI); // We can also eliminate cases by determining that their values are outside of // the limited range of the condition based on how many significant (non-sign) diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 85c9464b5569..49effda5d833 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -466,9 +466,7 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B, } Value *Offset = GEP->getOperand(2); - unsigned BitWidth = Offset->getType()->getIntegerBitWidth(); - KnownBits Known(BitWidth); - computeKnownBits(Offset, Known, DL, 0, nullptr, CI, nullptr); + KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr); Known.Zero.flipAllBits(); uint64_t ArrSize = cast(GEP->getSourceElementType())->getNumElements(); diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 1dc554bede7e..3b036a6ac430 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2092,6 +2092,10 @@ private: /// The data is collected per VF. DenseMap> Scalars; + /// Holds the instructions (address computations) that are forced to be + /// scalarized. + DenseMap> ForcedScalars; + /// Returns the expected difference in cost from scalarizing the expression /// feeding a predicated instruction \p PredInst. The instructions to /// scalarize and their scalar costs are collected in \p ScalarCosts. A @@ -5086,12 +5090,18 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } bool LoopVectorizationLegality::canVectorize() { + // Store the result and return it at the end instead of exiting early, in case + // allowExtraAnalysis is used to report multiple reasons for not vectorizing. + bool Result = true; // We must have a loop in canonical form. Loops with indirectbr in them cannot // be canonicalized. if (!TheLoop->getLoopPreheader()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // FIXME: The code is currently dead, since the loop gets sent to @@ -5101,21 +5111,30 @@ bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->empty()) { ORE->emit(createMissedAnalysis("NotInnermostLoop") << "loop is not the innermost loop"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We must have a single backedge. if (TheLoop->getNumBackEdges() != 1) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We must have a single exiting block. 
if (!TheLoop->getExitingBlock()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We only handle bottom-tested loops, i.e. loop in which the condition is @@ -5124,7 +5143,10 @@ bool LoopVectorizationLegality::canVectorize() { if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { ORE->emit(createMissedAnalysis("CFGNotUnderstood") << "loop control flow is not understood by vectorizer"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // We need to have a loop header. @@ -5135,28 +5157,28 @@ bool LoopVectorizationLegality::canVectorize() { unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); - return false; - } - - // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = PSE.getBackedgeTakenCount(); - if (ExitCount == PSE.getSE()->getCouldNotCompute()) { - ORE->emit(createMissedAnalysis("CantComputeNumberOfIterations") - << "could not determine number of loop iterations"); - DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // Check if we can vectorize the instructions and CFG in this loop. if (!canVectorizeInstrs()) { DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } // Go over each instruction and look at memory deps. if (!canVectorizeMemory()) { DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } DEBUG(dbgs() << "LV: We can vectorize this loop" @@ -5184,13 +5206,17 @@ bool LoopVectorizationLegality::canVectorize() { << "Too many SCEV assumptions need to be made and checked " << "at runtime"); DEBUG(dbgs() << "LV: Too many SCEV checks needed.\n"); - return false; + if (ORE->allowExtraAnalysis()) + Result = false; + else + return false; } - // Okay! We can vectorize. At this point we don't have any other mem analysis + // Okay! We've done all the tests. If any have failed, return false. Otherwise + // we can vectorize, and at this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with // no restrictions. - return true; + return Result; } static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) { @@ -5554,6 +5580,13 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) { DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n"); } + // Insert the forced scalars. + // FIXME: Currently widenPHIInstruction() often creates a dead vector + // induction variable when the PHI user is scalarized. + if (ForcedScalars.count(VF)) + for (auto *I : ForcedScalars.find(VF)->second) + Worklist.insert(I); + // Expand the worklist by looking through any bitcasts and getelementptr // instructions we've already identified as scalar. 
This is similar to the // expansion step in collectLoopUniforms(); however, here we're only @@ -7129,11 +7162,18 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { if (VF > 1 && isProfitableToScalarize(I, VF)) return VectorizationCostTy(InstsToScalarize[VF][I], false); + // Forced scalars do not have any scalarization overhead. + if (VF > 1 && ForcedScalars.count(VF) && + ForcedScalars.find(VF)->second.count(I)) + return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false); + Type *VectorTy; unsigned C = getInstructionCost(I, VF, VectorTy); + // Note: Even if all instructions are scalarized, return true if any memory + // accesses appear in the loop to get benefits from address folding etc. bool TypeNotScalarized = - VF > 1 && !VectorTy->isVoidTy() && TTI.getNumberOfParts(VectorTy) < VF; + VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF; return VectorizationCostTy(C, TypeNotScalarized); } @@ -7208,6 +7248,62 @@ void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) { setWideningDecision(&I, VF, Decision, Cost); } } + + // Make sure that any load of address and any other address computation + // remains scalar unless there is gather/scatter support. This avoids + // inevitable extracts into address registers, and also has the benefit of + // activating LSR more, since that pass can't optimize vectorized + // addresses. + if (TTI.prefersVectorizedAddressing()) + return; + + // Start with all scalar pointer uses. + SmallPtrSet AddrDefs; + for (BasicBlock *BB : TheLoop->blocks()) + for (Instruction &I : *BB) { + Instruction *PtrDef = + dyn_cast_or_null(getPointerOperand(&I)); + if (PtrDef && TheLoop->contains(PtrDef) && + getWideningDecision(&I, VF) != CM_GatherScatter) + AddrDefs.insert(PtrDef); + } + + // Add all instructions used to generate the addresses. + SmallVector Worklist; + for (auto *I : AddrDefs) + Worklist.push_back(I); + while (!Worklist.empty()) { + Instruction *I = Worklist.pop_back_val(); + for (auto &Op : I->operands()) + if (auto *InstOp = dyn_cast(Op)) + if ((InstOp->getParent() == I->getParent()) && !isa(InstOp) && + AddrDefs.insert(InstOp).second == true) + Worklist.push_back(InstOp); + } + + for (auto *I : AddrDefs) { + if (isa(I)) { + // Setting the desired widening decision should ideally be handled in + // by cost functions, but since this involves the task of finding out + // if the loaded register is involved in an address computation, it is + // instead changed here when we know this is the case. + if (getWideningDecision(I, VF) == CM_Widen) + // Scalarize a widened load of address. + setWideningDecision(I, VF, CM_Scalarize, + (VF * getMemoryInstructionCost(I, 1))); + else if (auto Group = Legal->getInterleavedAccessGroup(I)) { + // Scalarize an interleave group of address loads. + for (unsigned I = 0; I < Group->getFactor(); ++I) { + if (Instruction *Member = Group->getMember(I)) + setWideningDecision(Member, VF, CM_Scalarize, + (VF * getMemoryInstructionCost(Member, 1))); + } + } + } else + // Make sure I gets scalarized and a cost estimate without + // scalarization overhead. 
+ ForcedScalars[VF].insert(I); + } } unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, @@ -7216,7 +7312,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, Type *RetTy = I->getType(); if (canTruncateToMinimalBitwidth(I, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); - VectorTy = ToVectorTy(RetTy, VF); + VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF); auto SE = PSE.getSE(); // TODO: We need to estimate the cost of intrinsic calls. @@ -7349,9 +7445,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } else if (Legal->isUniform(Op2)) { Op2VK = TargetTransformInfo::OK_UniformValue; } - SmallVector Operands(I->operand_values()); - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, - Op2VK, Op1VP, Op2VP, Operands); + SmallVector Operands(I->operand_values()); + unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1; + return N * TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, + Op2VK, Op1VP, Op2VP, Operands); } case Instruction::Select: { SelectInst *SI = cast(I); @@ -7374,7 +7471,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, } case Instruction::Store: case Instruction::Load: { - VectorTy = ToVectorTy(getMemInstValueType(I), VF); + unsigned Width = VF; + if (Width > 1) { + InstWidening Decision = getWideningDecision(I, Width); + assert(Decision != CM_Unknown && + "CM decision should be taken at this point"); + if (Decision == CM_Scalarize) + Width = 1; + } + VectorTy = ToVectorTy(getMemInstValueType(I), Width); return getMemoryInstructionCost(I, VF); } case Instruction::ZExt: diff --git a/test/Analysis/CostModel/AArch64/falkor.ll b/test/Analysis/CostModel/AArch64/falkor.ll deleted file mode 100644 index e9563191f077..000000000000 --- a/test/Analysis/CostModel/AArch64/falkor.ll +++ /dev/null @@ -1,26 +0,0 @@ -; RUN: opt < %s -cost-model -analyze -mcpu=falkor | FileCheck %s - -target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-gnu" - -; CHECK-LABEL: vectorInstrCost -define void @vectorInstrCost() { - - ; Vector extracts - extracting the first element should have a zero cost; - ; all other elements should have a cost of two. - ; - ; CHECK: cost of 0 {{.*}} extractelement <2 x i64> undef, i32 0 - ; CHECK: cost of 2 {{.*}} extractelement <2 x i64> undef, i32 1 - %t1 = extractelement <2 x i64> undef, i32 0 - %t2 = extractelement <2 x i64> undef, i32 1 - - ; Vector inserts - inserting the first element should have a zero cost; all - ; other elements should have a cost of two. 
- ; - ; CHECK: cost of 0 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 0 - ; CHECK: cost of 2 {{.*}} insertelement <2 x i64> undef, i64 undef, i32 1 - %t3 = insertelement <2 x i64> undef, i64 undef, i32 0 - %t4 = insertelement <2 x i64> undef, i64 undef, i32 1 - - ret void -} diff --git a/test/Analysis/Delinearization/constant_functions_multi_dim.ll b/test/Analysis/Delinearization/constant_functions_multi_dim.ll new file mode 100644 index 000000000000..b44b900d3f52 --- /dev/null +++ b/test/Analysis/Delinearization/constant_functions_multi_dim.ll @@ -0,0 +1,80 @@ +; RUN: opt -delinearize -analyze < %s | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + +; CHECK: Inst: %tmp = load float, float* %arrayidx, align 4 +; CHECK-NEXT: In Loop with Header: for.inc +; CHECK-NEXT: AccessFunction: {(4 * %N * %call),+,4}<%for.inc> +; CHECK-NEXT: Base offset: %A +; CHECK-NEXT: ArrayDecl[UnknownSize][%N] with elements of 4 bytes. +; CHECK-NEXT: ArrayRef[%call][{0,+,1}<%for.inc>] + +; CHECK: Inst: %tmp5 = load float, float* %arrayidx4, align 4 +; CHECK-NEXT: In Loop with Header: for.inc +; CHECK-NEXT: AccessFunction: {(4 * %call1),+,(4 * %N)}<%for.inc> +; CHECK-NEXT: Base offset: %B +; CHECK-NEXT: ArrayDecl[UnknownSize][%N] with elements of 4 bytes. +; CHECK-NEXT: ArrayRef[{0,+,1}<%for.inc>][%call1] + +; Function Attrs: noinline nounwind uwtable +define void @mat_mul(float* %C, float* %A, float* %B, i64 %N) #0 !kernel_arg_addr_space !2 !kernel_arg_access_qual !3 !kernel_arg_type !4 !kernel_arg_base_type !4 !kernel_arg_type_qual !5 { +entry: + br label %entry.split + +entry.split: ; preds = %entry + %call = tail call i64 @_Z13get_global_idj(i32 0) #3 + %call1 = tail call i64 @_Z13get_global_idj(i32 1) #3 + %cmp1 = icmp sgt i64 %N, 0 + %mul = mul nsw i64 %call, %N + br i1 %cmp1, label %for.inc.lr.ph, label %for.end + +for.inc.lr.ph: ; preds = %entry.split + br label %for.inc + +for.inc: ; preds = %for.inc.lr.ph, %for.inc + %acc.03 = phi float [ 0.000000e+00, %for.inc.lr.ph ], [ %tmp6, %for.inc ] + %m.02 = phi i64 [ 0, %for.inc.lr.ph ], [ %inc, %for.inc ] + %add = add nsw i64 %m.02, %mul + %arrayidx = getelementptr inbounds float, float* %A, i64 %add + %tmp = load float, float* %arrayidx, align 4 + %mul2 = mul nsw i64 %m.02, %N + %add3 = add nsw i64 %mul2, %call1 + %arrayidx4 = getelementptr inbounds float, float* %B, i64 %add3 + %tmp5 = load float, float* %arrayidx4, align 4 + %tmp6 = tail call float @llvm.fmuladd.f32(float %tmp, float %tmp5, float %acc.03) + %inc = add nuw nsw i64 %m.02, 1 + %exitcond = icmp ne i64 %inc, %N + br i1 %exitcond, label %for.inc, label %for.cond.for.end_crit_edge + +for.cond.for.end_crit_edge: ; preds = %for.inc + %.lcssa = phi float [ %tmp6, %for.inc ] + br label %for.end + +for.end: ; preds = %for.cond.for.end_crit_edge, %entry.split + %acc.0.lcssa = phi float [ %.lcssa, %for.cond.for.end_crit_edge ], [ 0.000000e+00, %entry.split ] + %add7 = add nsw i64 %mul, %call1 + %arrayidx8 = getelementptr inbounds float, float* %C, i64 %add7 + store float %acc.0.lcssa, float* %arrayidx8, align 4 + ret void +} + +; Function Attrs: nounwind readnone +declare i64 @_Z13get_global_idj(i32) #1 + +; Function Attrs: nounwind readnone speculatable +declare float @llvm.fmuladd.f32(float, float, float) #2 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" 
"no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone speculatable } +attributes #3 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{!"clang version 5.0.0 (trunk 303846) (llvm/trunk 303834)"} +!2 = !{i32 1, i32 1, i32 1, i32 0} +!3 = !{!"none", !"none", !"none", !"none"} +!4 = !{!"float*", !"float*", !"float*", !"long"} +!5 = !{!"", !"", !"", !""} diff --git a/test/Analysis/IVUsers/quadradic-exit-value.ll b/test/Analysis/IVUsers/quadradic-exit-value.ll index 6d4f1b039b48..afc215198218 100644 --- a/test/Analysis/IVUsers/quadradic-exit-value.ll +++ b/test/Analysis/IVUsers/quadradic-exit-value.ll @@ -30,13 +30,47 @@ exit: ret i64 %r } +; PR15470: LSR miscompile. The test1 function should return '1'. +; It is valid to fold SCEVUnknown into the recurrence because it +; was defined before the loop. +; +; SCEV does not know how to denormalize chained recurrences, so make +; sure they aren't marked as post-inc users. +; +; CHECK-LABEL: IV Users for loop %test1.loop +; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test1.loop> (post-inc with loop %test1.loop) in %f = ashr i32 %sext.us, 24 +define i32 @test1(i1 %cond) { +entry: + %sub.us = select i1 %cond, i32 0, i32 0 + br label %test1.loop + +test1.loop: + %inc1115.us = phi i32 [ 0, %entry ], [ %inc11.us, %test1.loop ] + %inc11.us = add nsw i32 %inc1115.us, 1 + %cmp.us = icmp slt i32 %inc11.us, 2 + br i1 %cmp.us, label %test1.loop, label %for.end + +for.end: + %tobool.us = icmp eq i32 %inc1115.us, 0 + %mul.us = shl i32 %inc1115.us, 24 + %sub.cond.us = sub nsw i32 %inc1115.us, %sub.us + %sext.us = mul i32 %mul.us, %sub.cond.us + %f = ashr i32 %sext.us, 24 + br label %exit + +exit: + ret i32 %f +} + ; PR15470: LSR miscompile. The test2 function should return '1'. +; It is illegal to fold SCEVUnknown (sext.us) into the recurrence +; because it is defined after the loop where this recurrence belongs. ; ; SCEV does not know how to denormalize chained recurrences, so make ; sure they aren't marked as post-inc users. 
; ; CHECK-LABEL: IV Users for loop %test2.loop -; CHECK-NO-LCSSA: %sext.us = {0,+,(16777216 + (-16777216 * %sub.us)),+,33554432}<%test2.loop> (post-inc with loop %test2.loop) in %f = ashr i32 %sext.us, 24 +; CHECK-NO-LCSSA: %sub.cond.us = ((-1 * %sub.us) + {0,+,1}<%test2.loop>) (post-inc with loop %test2.loop) in %sext.us = mul i32 %mul.us, %sub.cond.us define i32 @test2() { entry: br label %test2.loop diff --git a/test/Analysis/ScalarEvolution/different-loops-recs.ll b/test/Analysis/ScalarEvolution/different-loops-recs.ll index ad3d1e0bd110..6b88f09e936f 100644 --- a/test/Analysis/ScalarEvolution/different-loops-recs.ll +++ b/test/Analysis/ScalarEvolution/different-loops-recs.ll @@ -220,7 +220,8 @@ exit: ; Mix of previous use cases that demonstrates %s3 can be incorrectly treated as ; a recurrence of loop1 because of operands order if we pick recurrencies in an -; incorrect order. +; incorrect order. It also shows that we cannot safely fold v1 (SCEVUnknown) +; because we cannot prove for sure that it doesn't use Phis of loop 2. define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) { @@ -228,9 +229,9 @@ define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) { ; CHECK: %v1 = load i32, i32* %p ; CHECK-NEXT: --> %v1 ; CHECK: %s1 = add i32 %phi1, %v1 -; CHECK-NEXT: --> {(%a + %v1),+,1}<%loop1> +; CHECK-NEXT: --> ({%a,+,1}<%loop1> + %v1) ; CHECK: %s2 = add i32 %s1, %b -; CHECK-NEXT: --> {(%a + %b + %v1),+,1}<%loop1> +; CHECK-NEXT: --> ({(%a + %b),+,1}<%loop1> + %v1) ; CHECK: %s3 = add i32 %s2, %phi2 ; CHECK-NEXT: --> ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1) @@ -452,3 +453,60 @@ exit: %s6 = add i32 %phi3, %phi2 ret void } + +; Make sure that a complicated Phi does not get folded with rec's start value +; of a loop which is above. +define void @test_08() { + +; CHECK-LABEL: Classifying expressions for: @test_08 +; CHECK: %tmp11 = add i64 %iv.2.2, %iv.2.1 +; CHECK-NEXT: --> ({0,+,-1}<%loop_2> + %iv.2.1) +; CHECK: %tmp12 = trunc i64 %tmp11 to i32 +; CHECK-NEXT: --> (trunc i64 ({0,+,-1}<%loop_2> + %iv.2.1) to i32) +; CHECK: %tmp14 = mul i32 %tmp12, %tmp7 +; CHECK-NEXT: --> ((trunc i64 ({0,+,-1}<%loop_2> + %iv.2.1) to i32) * {-1,+,-1}<%loop_1>) +; CHECK: %tmp16 = mul i64 %iv.2.1, %iv.1.1 +; CHECK-NEXT: --> ({2,+,1}<%loop_1> * %iv.2.1) + +entry: + br label %loop_1 + +loop_1: + %iv.1.1 = phi i64 [ 2, %entry ], [ %iv.1.1.next, %loop_1_back_branch ] + %iv.1.2 = phi i32 [ -1, %entry ], [ %iv.1.2.next, %loop_1_back_branch ] + br label %loop_1_exit + +dead: + br label %loop_1_exit + +loop_1_exit: + %tmp5 = icmp sgt i64 %iv.1.1, 2 + br i1 %tmp5, label %loop_2_preheader, label %loop_1_back_branch + +loop_1_back_branch: + %iv.1.1.next = add nuw nsw i64 %iv.1.1, 1 + %iv.1.2.next = add nsw i32 %iv.1.2, 1 + br label %loop_1 + +loop_2_preheader: + %tmp6 = sub i64 1, %iv.1.1 + %tmp7 = trunc i64 %tmp6 to i32 + br label %loop_2 + +loop_2: + %iv.2.1 = phi i64 [ 0, %loop_2_preheader ], [ %tmp16, %loop_2 ] + %iv.2.2 = phi i64 [ 0, %loop_2_preheader ], [ %iv.2.2.next, %loop_2 ] + %iv.2.3 = phi i64 [ 2, %loop_2_preheader ], [ %iv.2.3.next, %loop_2 ] + %tmp11 = add i64 %iv.2.2, %iv.2.1 + %tmp12 = trunc i64 %tmp11 to i32 + %tmp14 = mul i32 %tmp12, %tmp7 + %tmp16 = mul i64 %iv.2.1, %iv.1.1 + %iv.2.3.next = add nuw nsw i64 %iv.2.3, 1 + %iv.2.2.next = add nsw i64 %iv.2.2, -1 + %tmp17 = icmp slt i64 %iv.2.3.next, %iv.1.1 + br i1 %tmp17, label %loop_2, label %exit + +exit: + %tmp10 = add i32 %iv.1.2, 3 + ret void +} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll 
b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll index 71ea9d54f647..0298315a5510 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll +++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll @@ -43,7 +43,7 @@ define [1 x double] @constant() { ; The key problem here is that we may fail to create an MBB referenced by a ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things ; happen. -; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %vreg4, %vreg2; mem:ST4[%addr] GPR:%vreg4,%vreg2 (in function: pending_phis) +; FALLBACK-WITH-REPORT-ERR: remark: :0:0: cannot select: G_STORE %vreg5, %vreg2; mem:ST4[%addr] GPR:%vreg5,%vreg2 (in function: pending_phis) ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis: define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) { diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll index 3ecdb7bbedfb..0972840de47b 100644 --- a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll +++ b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll @@ -1,10 +1,10 @@ ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ ; RUN: -O0 -aarch64-enable-global-isel-at-O=0 \ -; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix NOFALLBACK ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ ; RUN: -O0 -aarch64-enable-global-isel-at-O=0 -global-isel-abort=2 \ -; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK +; RUN: | FileCheck %s --check-prefix ENABLED --check-prefix ENABLED-O0 --check-prefix FALLBACK ; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \ ; RUN: -global-isel \ @@ -32,6 +32,7 @@ ; ENABLED: IRTranslator ; ENABLED-NEXT: Legalizer ; ENABLED-NEXT: RegBankSelect +; ENABLED-O0-NEXT: Localizer ; ENABLED-NEXT: InstructionSelect ; ENABLED-NEXT: ResetMachineFunction diff --git a/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir b/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir new file mode 100644 index 000000000000..ea8a77ca3917 --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/localizer-in-O0-pipeline.mir @@ -0,0 +1,96 @@ +# RUN: llc -O0 -mtriple aarch64-apple-ios %s -global-isel -start-after regbankselect \ +# RUN: -stop-before instruction-select -o - | FileCheck --check-prefix=CHECK --check-prefix=OPTNONE %s +# RUN: llc -mtriple aarch64-apple-ios %s -global-isel -start-after regbankselect \ +# RUN: -stop-before instruction-select -o - | FileCheck --check-prefix=CHECK --check-prefix=OPT %s +# +# Check that we are only running the localizer at O0 and that it runs +# between the regbankselect pass and the instruction-select. +# Moreover, check that it does what we expect. +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "aarch64-apple-ios" + + define float @foo(float %arg, i1 %cond) { + br i1 %cond, label %true, label %false + + true: ; preds = %0 + br label %end + + false: ; preds = %0 + br label %end + + end: ; preds = %false, %true + %val = phi float [ 1.000000e+00, %true ], [ 2.000000e+00, %false ] + %res = fadd float %arg, %val + ret float %res + } + +... 
+--- +# CHECK-LABEL: name: foo +name: foo +alignment: 2 +legalized: true +regBankSelected: true +tracksRegLiveness: true +registers: +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: fpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +# CHECK-NEXT: - { id: 5, class: fpr } +# The localizer will create two new values to materialize the constants. +# OPTNONE-NEXT: - { id: 6, class: fpr } +# OPTNONE-NEXT: - { id: 7, class: fpr } + - { id: 0, class: fpr } + - { id: 1, class: gpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + - { id: 5, class: fpr } + +# First block remains untouched +# CHECK: body +# CHECK: %4(s32) = G_FCONSTANT float 1.000000e+00 +# CHECK: %5(s32) = G_FCONSTANT float 2.000000e+00 + +# Second block will get the constant 1.0 when the localizer is enabled. +# CHECK: bb.1.true: +# OPT-NOT: G_FCONSTANT +# OPTNONE: [[FONE:%[0-9]+]](s32) = G_FCONSTANT float 1.000000e+00 +# CHECK: G_BR %bb.3.end + +# Thrid block will get the constant 2.0 when the localizer is enabled. +# CHECK: bb.2.false: +# OPT-NOT: G_FCONSTANT +# OPTNONE: [[FTWO:%[0-9]+]](s32) = G_FCONSTANT float 2.000000e+00 + +# CHECK: bb.3.end +# OPTNONE: %2(s32) = PHI [[FONE]](s32), %bb.1.true, [[FTWO]](s32), %bb.2.false +# OPT: %2(s32) = PHI %4(s32), %bb.1.true, %5(s32), %bb.2.false +# CHECK-NEXT: G_FADD %0, %2 +body: | + bb.0 (%ir-block.0): + liveins: %s0, %w0 + + %0(s32) = COPY %s0 + %1(s1) = COPY %w0 + %4(s32) = G_FCONSTANT float 1.000000e+00 + %5(s32) = G_FCONSTANT float 2.000000e+00 + G_BRCOND %1(s1), %bb.1.true + G_BR %bb.2.false + + bb.1.true: + G_BR %bb.3.end + + bb.2.false: + + bb.3.end: + %2(s32) = PHI %4(s32), %bb.1.true, %5(s32), %bb.2.false + %3(s32) = G_FADD %0, %2 + %s0 = COPY %3(s32) + RET_ReallyLR implicit %s0 + +... diff --git a/test/CodeGen/AArch64/GlobalISel/localizer.mir b/test/CodeGen/AArch64/GlobalISel/localizer.mir new file mode 100644 index 000000000000..8fbb2040157e --- /dev/null +++ b/test/CodeGen/AArch64/GlobalISel/localizer.mir @@ -0,0 +1,312 @@ +# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=localizer -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK + +# Test the localizer. + +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + + define void @local_use() { ret void } + define void @non_local_1use() { ret void } + define void @non_local_2uses() { ret void } + define void @non_local_phi_use() { ret void } + define void @non_local_phi_use_followed_by_use() { ret void } + define void @non_local_phi_use_followed_by_use_fi() { ret void } + define void @float_non_local_phi_use_followed_by_use_fi() { ret void } +... + +--- +# CHECK-LABEL: name: local_use +name: local_use +legalized: true +regBankSelected: true + +# CHECK: registers: +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 +body: | + bb.0: + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 +... + +--- +# CHECK-LABEL: name: non_local_1use +name: non_local_1use +legalized: true +regBankSelected: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. 
+#CHECK-NEXT: - { id: 3, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %3(s32) = G_CONSTANT 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %1 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + %2(s32) = G_ADD %0, %1 +... + + +--- +# CHECK-LABEL: name: non_local_2uses +name: non_local_2uses +legalized: true +regBankSelected: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 3, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %3(s32) = G_CONSTANT 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %3 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + %2(s32) = G_ADD %0, %0 +... + +--- +# CHECK-LABEL: name: non_local_phi_use +name: non_local_phi_use +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +#CHECK-NEXT: - { id: 3, class: gpr } +#CHECK-NEXT: - { id: 4, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_CONSTANT 1 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_ADD %3, %3 +... + +--- +# CHECK-LABEL: name: non_local_phi_use_followed_by_use +name: non_local_phi_use_followed_by_use +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +#CHECK-NEXT: - { id: 3, class: gpr } +#CHECK-NEXT: - { id: 4, class: gpr } +# The newly created regs should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: gpr } +#CHECK-NEXT: - { id: 6, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_CONSTANT 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_CONSTANT 1 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +# CHECK-NEXT: %6(s32) = G_CONSTANT 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %6 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_CONSTANT 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_ADD %3, %0 +... 
+ +--- +# CHECK-LABEL: name: non_local_phi_use_followed_by_use_fi +name: non_local_phi_use_followed_by_use_fi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: gpr } +#CHECK-NEXT: - { id: 1, class: gpr } +#CHECK-NEXT: - { id: 2, class: gpr } +#CHECK-NEXT: - { id: 3, class: gpr } +#CHECK-NEXT: - { id: 4, class: gpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: gpr } +#CHECK-NEXT: - { id: 6, class: gpr } + +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } + - { id: 4, class: gpr } + +# CHECK: body: +# CHECK: %0(s32) = G_FRAME_INDEX 1 +# CHECK-NEXT: %1(s32) = G_ADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_FRAME_INDEX 1 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +# CHECK-NEXT: %6(s32) = G_FRAME_INDEX 1 +# CHECK-NEXT: %2(s32) = G_ADD %3, %6 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_FRAME_INDEX 1 + %1(s32) = G_ADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_ADD %3, %0 +... + +--- +# CHECK-LABEL: name: float_non_local_phi_use_followed_by_use_fi +name: float_non_local_phi_use_followed_by_use_fi +legalized: true +regBankSelected: true +tracksRegLiveness: true + +# CHECK: registers: +# Existing registers should be left untouched +# CHECK: - { id: 0, class: fpr } +#CHECK-NEXT: - { id: 1, class: fpr } +#CHECK-NEXT: - { id: 2, class: fpr } +#CHECK-NEXT: - { id: 3, class: fpr } +#CHECK-NEXT: - { id: 4, class: fpr } +# The newly created reg should be on the same regbank/regclass as its origin. +#CHECK-NEXT: - { id: 5, class: fpr } +#CHECK-NEXT: - { id: 6, class: fpr } + +registers: + - { id: 0, class: fpr } + - { id: 1, class: fpr } + - { id: 2, class: fpr } + - { id: 3, class: fpr } + - { id: 4, class: fpr } + +# CHECK: body: +# CHECK: %0(s32) = G_FCONSTANT float 1.0 +# CHECK-NEXT: %1(s32) = G_FADD %0, %0 + +# CHECK: bb.1: +# CHECK: %5(s32) = G_FCONSTANT float 1.0 + +# CHECK: bb.2: +# CHECK: %3(s32) = PHI %5(s32), %bb.1 +# CHECK-NEXT: %6(s32) = G_FCONSTANT float 1.0 +# CHECK-NEXT: %2(s32) = G_FADD %3, %6 +body: | + bb.0: + successors: %bb.1 + + %0(s32) = G_FCONSTANT float 1.0 + %1(s32) = G_FADD %0, %0 + + bb.1: + successors: %bb.2 + + bb.2: + %3(s32) = PHI %0(s32), %bb.1 + %2(s32) = G_FADD %3, %0 +... 
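The new localizer.mir tests above all exercise the same rewrite: a cheap definition (G_CONSTANT, G_FCONSTANT or G_FRAME_INDEX) created in bb.0 is used from a later block, and the pass is expected to materialize one local copy per using block and rewrite those uses (the %3/%5/%6 registers the CHECK lines look for). A rough, self-contained toy model of that rewrite follows; the Inst struct, the isRematerializable helper and the register numbering are invented for the sketch and are not the actual GlobalISel data structures.

#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

// Toy instruction: Def is the value it defines, Ops the values it reads.
struct Inst {
  std::string Block, Def, Opcode;
  std::vector<std::string> Ops;
};

// Only cheap, side-effect-free definitions are worth duplicating per block.
static bool isRematerializable(const Inst &I) {
  return I.Opcode == "G_CONSTANT" || I.Opcode == "G_FCONSTANT" ||
         I.Opcode == "G_FRAME_INDEX";
}

int main() {
  // %0 is defined in bb.0 but used twice in bb.1; after localization bb.1
  // gets a single fresh copy (%3) and both uses are rewritten to it.
  std::vector<Inst> Prog = {
      {"bb.0", "%0", "G_CONSTANT", {}},
      {"bb.0", "%1", "G_ADD", {"%0", "%0"}},
      {"bb.1", "%2", "G_ADD", {"%0", "%0"}},
  };

  std::map<std::string, Inst> DefOf; // value -> defining instruction
  for (const Inst &I : Prog)
    DefOf[I.Def] = I;

  // (block, original value) -> block-local copy created for it, if any.
  std::map<std::pair<std::string, std::string>, std::string> CopyIn;
  int NextReg = static_cast<int>(Prog.size());
  std::vector<Inst> Out;

  for (Inst I : Prog) {
    for (std::string &Op : I.Ops) {
      const Inst &D = DefOf[Op];
      if (D.Block == I.Block || !isRematerializable(D))
        continue; // local use, or too expensive to duplicate: leave it alone
      std::string &Copy = CopyIn[{I.Block, Op}];
      if (Copy.empty()) {
        // First non-local use in this block: materialize one local copy.
        Inst C = D;
        C.Block = I.Block;
        C.Def = "%" + std::to_string(NextReg++);
        Out.push_back(C);
        Copy = C.Def;
      }
      Op = Copy; // rewrite the use to the block-local copy
    }
    Out.push_back(I);
  }

  for (const Inst &I : Out) {
    std::cout << I.Block << ": " << I.Def << " = " << I.Opcode;
    for (const std::string &Op : I.Ops)
      std::cout << " " << Op;
    std::cout << "\n";
  }
}

Run on this non_local_2uses-style input, the sketch prints a single new copy (%3) in bb.1 with both uses rewritten to it, which is the shape the CHECK lines in that test expect; the PHI-use tests additionally keep the PHI operand pointing at the per-predecessor copy rather than at a copy in the PHI's own block.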
diff --git a/test/CodeGen/AArch64/aarch64-stp-cluster.ll b/test/CodeGen/AArch64/aarch64-stp-cluster.ll index fe5abbf15eff..25cf313b81e7 100644 --- a/test/CodeGen/AArch64/aarch64-stp-cluster.ll +++ b/test/CodeGen/AArch64/aarch64-stp-cluster.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -aarch64-enable-stp-suppress=false -o - 2>&1 > /dev/null | FileCheck %s ; CHECK: ********** MI Scheduling ********** ; CHECK-LABEL: stp_i64_scale:BB#0 diff --git a/test/CodeGen/AArch64/arm64-csldst-mmo.ll b/test/CodeGen/AArch64/arm64-csldst-mmo.ll index 4930c493d62c..cfb8e3a38c49 100644 --- a/test/CodeGen/AArch64/arm64-csldst-mmo.ll +++ b/test/CodeGen/AArch64/arm64-csldst-mmo.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; REQUIRES: asserts @G = external global [0 x i32], align 4 diff --git a/test/CodeGen/AArch64/arm64-inline-asm.ll b/test/CodeGen/AArch64/arm64-inline-asm.ll index f28d0ab07c5a..f849df2a51ec 100644 --- a/test/CodeGen/AArch64/arm64-inline-asm.ll +++ b/test/CodeGen/AArch64/arm64-inline-asm.ll @@ -254,3 +254,10 @@ define void @test_constraint_w(i32 %a) { tail call void asm sideeffect "sqxtn h0, ${0:s}\0A", "w"(i32 %a) ret void } + +define void @test_inline_modifier_a(i8* %ptr) nounwind { + ; CHECK-LABEL: test_inline_modifier_a: + tail call void asm sideeffect "prfm pldl1keep, ${0:a}\0A", "r"(i8* %ptr) + ; CHECK: prfm pldl1keep, [x0] + ret void +} diff --git a/test/CodeGen/AArch64/arm64-ldp-cluster.ll b/test/CodeGen/AArch64/arm64-ldp-cluster.ll index 0cfbe5958f4d..64e535ca7499 100644 --- a/test/CodeGen/AArch64/arm64-ldp-cluster.ll +++ b/test/CodeGen/AArch64/arm64-ldp-cluster.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=exynos-m1 -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck --check-prefix=EXYNOS %s ; Test ldr clustering. 
; CHECK: ********** MI Scheduling ********** diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll index 41287a17da86..307d1ec1aa8c 100644 --- a/test/CodeGen/AArch64/arm64-misched-basic-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A53.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -disable-machine-dce -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -disable-machine-dce -o - -misched-limit=2 2>&1 > /dev/null | FileCheck %s ; ; The Cortex-A53 machine model will cause the MADD instruction to be scheduled ; much higher than the ADD instructions in order to hide latency. When not diff --git a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll index fac5f8ad2e9f..82ba18ce72ca 100644 --- a/test/CodeGen/AArch64/arm64-misched-basic-A57.ll +++ b/test/CodeGen/AArch64/arm64-misched-basic-A57.ll @@ -6,7 +6,7 @@ ; the loads to avoid unnecessary stalls. The generic machine model schedules 4 ; loads consecutively for this case and will cause stalls. ; -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; CHECK: ********** MI Scheduling ********** ; CHECK: main:BB#2 ; CHECK: LDR diff --git a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll index 0ee74d1f782e..cde62fcb3f95 100644 --- a/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll +++ b/test/CodeGen/AArch64/arm64-misched-forwarding-A53.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a53 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; For Cortex-A53, shiftable operands that are not actually shifted ; are not needed for an additional two cycles. diff --git a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll index 0ec754f97ec7..748a4762d82f 100644 --- a/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll +++ b/test/CodeGen/AArch64/arm64-misched-memdep-bug.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; Test for bug in misched memory dependency calculation. 
; diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll index 3593668e0156..75f45da0e48f 100644 --- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll +++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu -mcpu=cortex-a57 -enable-misched=0 -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s @G1 = common global [100 x i32] zeroinitializer, align 4 diff --git a/test/CodeGen/AArch64/arm64-vabs.ll b/test/CodeGen/AArch64/arm64-vabs.ll index ff7a0a8300e2..6b754b0a169e 100644 --- a/test/CodeGen/AArch64/arm64-vabs.ll +++ b/test/CodeGen/AArch64/arm64-vabs.ll @@ -33,7 +33,7 @@ define <2 x i64> @sabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: sabdl2_8h: -;CHECK: sabdl2.8h +;CHECK: sabdl.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> @@ -45,7 +45,7 @@ define <8 x i16> @sabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sabdl2_4s: -;CHECK: sabdl2.4s +;CHECK: sabdl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -57,7 +57,7 @@ define <4 x i32> @sabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @sabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sabdl2_2d: -;CHECK: sabdl2.2d +;CHECK: sabdl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -99,7 +99,7 @@ define <2 x i64> @uabdl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: uabdl2_8h: -;CHECK: uabdl2.8h +;CHECK: uabdl.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> @@ -112,7 +112,7 @@ define <8 x i16> @uabdl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: uabdl2_4s: -;CHECK: uabdl2.4s +;CHECK: uabdl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -124,7 +124,7 @@ define <4 x i32> @uabdl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @uabdl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: uabdl2_2d: -;CHECK: uabdl2.2d +;CHECK: uabdl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -561,7 +561,7 @@ define <2 x i64> @sabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { ;CHECK-LABEL: sabal2_8h: -;CHECK: sabal2.8h +;CHECK: sabal.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp3 = load <8 x i16>, <8 x i16>* %C @@ -575,7 +575,7 @@ define <8 x i16> @sabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) 
nounwind { ;CHECK-LABEL: sabal2_4s: -;CHECK: sabal2.4s +;CHECK: sabal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -589,7 +589,7 @@ define <4 x i32> @sabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin define <2 x i64> @sabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sabal2_2d: -;CHECK: sabal2.2d +;CHECK: sabal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -639,7 +639,7 @@ define <2 x i64> @uabal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwind define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwind { ;CHECK-LABEL: uabal2_8h: -;CHECK: uabal2.8h +;CHECK: uabal.8h %load1 = load <16 x i8>, <16 x i8>* %A %load2 = load <16 x i8>, <16 x i8>* %B %tmp3 = load <8 x i16>, <8 x i16>* %C @@ -653,7 +653,7 @@ define <8 x i16> @uabal2_8h(<16 x i8>* %A, <16 x i8>* %B, <8 x i16>* %C) nounwin define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: uabal2_4s: -;CHECK: uabal2.4s +;CHECK: uabal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -667,7 +667,7 @@ define <4 x i32> @uabal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwin define <2 x i64> @uabal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: uabal2_2d: -;CHECK: uabal2.2d +;CHECK: uabal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C diff --git a/test/CodeGen/AArch64/arm64-vadd.ll b/test/CodeGen/AArch64/arm64-vadd.ll index 9d09251524ea..2a25538250e4 100644 --- a/test/CodeGen/AArch64/arm64-vadd.ll +++ b/test/CodeGen/AArch64/arm64-vadd.ll @@ -318,7 +318,7 @@ define <2 x i64> @uaddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: uaddw2_8h: -;CHECK: uaddw2.8h +;CHECK: uaddw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -331,7 +331,7 @@ define <8 x i16> @uaddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: uaddw2_4s: -;CHECK: uaddw2.4s +;CHECK: uaddw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -344,7 +344,7 @@ define <4 x i32> @uaddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @uaddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: uaddw2_2d: -;CHECK: uaddw2.2d +;CHECK: uaddw.2d %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B @@ -387,7 +387,7 @@ define <2 x i64> @saddw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: saddw2_8h: -;CHECK: saddw2.8h +;CHECK: saddw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -400,7 +400,7 @@ define <8 x i16> @saddw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: saddw2_4s: -;CHECK: saddw2.4s +;CHECK: saddw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -413,7 +413,7 @@ define <4 x i32> @saddw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @saddw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: saddw2_2d: -;CHECK: saddw2.2d +;CHECK: saddw.2d %tmp1 = 
load <2 x i64>, <2 x i64>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B diff --git a/test/CodeGen/AArch64/arm64-vmul.ll b/test/CodeGen/AArch64/arm64-vmul.ll index a7668ec97979..f70ed9a43427 100644 --- a/test/CodeGen/AArch64/arm64-vmul.ll +++ b/test/CodeGen/AArch64/arm64-vmul.ll @@ -83,7 +83,7 @@ define <2 x i64> @sqdmull2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sqdmull2_4s: -;CHECK: sqdmull2.4s +;CHECK: sqdmull.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -94,7 +94,7 @@ define <4 x i32> @sqdmull2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @sqdmull2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sqdmull2_2d: -;CHECK: sqdmull2.2d +;CHECK: sqdmull.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -324,7 +324,7 @@ define <2 x i64> @sqdmlal2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_4s: -;CHECK: sqdmlal2.4s +;CHECK: sqdmlal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -337,7 +337,7 @@ define <4 x i32> @sqdmlal2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw define <2 x i64> @sqdmlal2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_2d: -;CHECK: sqdmlal2.2d +;CHECK: sqdmlal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -372,7 +372,7 @@ define <2 x i64> @sqdmlsl2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) nounwin define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_4s: -;CHECK: sqdmlsl2.4s +;CHECK: sqdmlsl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -385,7 +385,7 @@ define <4 x i32> @sqdmlsl2_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounw define <2 x i64> @sqdmlsl2_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_2d: -;CHECK: sqdmlsl2.2d +;CHECK: sqdmlsl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -874,7 +874,7 @@ define <2 x i64> @sqdmull_lane_2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: sqdmull2_lane_4s: ;CHECK-NOT: dup -;CHECK: sqdmull2.4s +;CHECK: sqdmull.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> @@ -886,7 +886,7 @@ define <4 x i32> @sqdmull2_lane_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @sqdmull2_lane_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: sqdmull2_lane_2d: ;CHECK-NOT: dup -;CHECK: sqdmull2.2d +;CHECK: sqdmull.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> @@ -994,7 +994,7 @@ define <2 x i64> @sqdmlal_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_lane_4s: ;CHECK-NOT: dup 
-;CHECK: sqdmlal2.4s +;CHECK: sqdmlal.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1008,7 +1008,7 @@ define <4 x i32> @sqdmlal2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) define <2 x i64> @sqdmlal2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlal2_lane_2d: ;CHECK-NOT: dup -;CHECK: sqdmlal2.2d +;CHECK: sqdmlal.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C @@ -1147,7 +1147,7 @@ define <2 x i64> @sqdmlsl_lane_2d(<2 x i32>* %A, <2 x i32>* %B, <2 x i64>* %C) n define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_lane_4s: ;CHECK-NOT: dup -;CHECK: sqdmlsl2.4s +;CHECK: sqdmlsl.4s %load1 = load <8 x i16>, <8 x i16>* %A %load2 = load <8 x i16>, <8 x i16>* %B %tmp3 = load <4 x i32>, <4 x i32>* %C @@ -1161,7 +1161,7 @@ define <4 x i32> @sqdmlsl2_lane_4s(<8 x i16>* %A, <8 x i16>* %B, <4 x i32>* %C) define <2 x i64> @sqdmlsl2_lane_2d(<4 x i32>* %A, <4 x i32>* %B, <2 x i64>* %C) nounwind { ;CHECK-LABEL: sqdmlsl2_lane_2d: ;CHECK-NOT: dup -;CHECK: sqdmlsl2.2d +;CHECK: sqdmlsl.2d %load1 = load <4 x i32>, <4 x i32>* %A %load2 = load <4 x i32>, <4 x i32>* %B %tmp3 = load <2 x i64>, <2 x i64>* %C diff --git a/test/CodeGen/AArch64/arm64-vshift.ll b/test/CodeGen/AArch64/arm64-vshift.ll index c1c4649bd6a4..6b0fe40b5a09 100644 --- a/test/CodeGen/AArch64/arm64-vshift.ll +++ b/test/CodeGen/AArch64/arm64-vshift.ll @@ -1164,7 +1164,7 @@ define <2 x i64> @ushll2d(<2 x i32>* %A) nounwind { define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind { ;CHECK-LABEL: ushll2_8h: -;CHECK: ushll2.8h v0, {{v[0-9]+}}, #1 +;CHECK: ushll.8h v0, {{v[0-9]+}}, #1 %load1 = load <16 x i8>, <16 x i8>* %A %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = zext <8 x i8> %tmp1 to <8 x i16> @@ -1174,7 +1174,7 @@ define <8 x i16> @ushll2_8h(<16 x i8>* %A) nounwind { define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind { ;CHECK-LABEL: ushll2_4s: -;CHECK: ushll2.4s v0, {{v[0-9]+}}, #1 +;CHECK: ushll.4s v0, {{v[0-9]+}}, #1 %load1 = load <8 x i16>, <8 x i16>* %A %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = zext <4 x i16> %tmp1 to <4 x i32> @@ -1184,7 +1184,7 @@ define <4 x i32> @ushll2_4s(<8 x i16>* %A) nounwind { define <2 x i64> @ushll2_2d(<4 x i32>* %A) nounwind { ;CHECK-LABEL: ushll2_2d: -;CHECK: ushll2.2d v0, {{v[0-9]+}}, #1 +;CHECK: ushll.2d v0, {{v[0-9]+}}, #1 %load1 = load <4 x i32>, <4 x i32>* %A %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = zext <2 x i32> %tmp1 to <2 x i64> @@ -1221,7 +1221,7 @@ define <2 x i64> @sshll2d(<2 x i32>* %A) nounwind { define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind { ;CHECK-LABEL: sshll2_8h: -;CHECK: sshll2.8h v0, {{v[0-9]+}}, #1 +;CHECK: sshll.8h v0, {{v[0-9]+}}, #1 %load1 = load <16 x i8>, <16 x i8>* %A %tmp1 = shufflevector <16 x i8> %load1, <16 x i8> undef, <8 x i32> %tmp2 = sext <8 x i8> %tmp1 to <8 x i16> @@ -1231,7 +1231,7 @@ define <8 x i16> @sshll2_8h(<16 x i8>* %A) nounwind { define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind { ;CHECK-LABEL: sshll2_4s: -;CHECK: sshll2.4s v0, {{v[0-9]+}}, #1 +;CHECK: sshll.4s v0, {{v[0-9]+}}, #1 %load1 = load <8 x i16>, <8 x i16>* %A %tmp1 = shufflevector <8 x i16> %load1, <8 x i16> undef, <4 x i32> %tmp2 = sext <4 x i16> %tmp1 to <4 x i32> @@ -1241,7 +1241,7 @@ define <4 x i32> @sshll2_4s(<8 x i16>* %A) nounwind { define <2 x i64> 
@sshll2_2d(<4 x i32>* %A) nounwind { ;CHECK-LABEL: sshll2_2d: -;CHECK: sshll2.2d v0, {{v[0-9]+}}, #1 +;CHECK: sshll.2d v0, {{v[0-9]+}}, #1 %load1 = load <4 x i32>, <4 x i32>* %A %tmp1 = shufflevector <4 x i32> %load1, <4 x i32> undef, <2 x i32> %tmp2 = sext <2 x i32> %tmp1 to <2 x i64> diff --git a/test/CodeGen/AArch64/arm64-vsub.ll b/test/CodeGen/AArch64/arm64-vsub.ll index 7af69118347e..6746e49989cb 100644 --- a/test/CodeGen/AArch64/arm64-vsub.ll +++ b/test/CodeGen/AArch64/arm64-vsub.ll @@ -157,7 +157,7 @@ define <2 x i64> @ssubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: ssubl2_8h: -;CHECK: ssubl2.8h +;CHECK: ssubl.8h %tmp1 = load <16 x i8>, <16 x i8>* %A %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> %ext1 = sext <8 x i8> %high1 to <8 x i16> @@ -172,7 +172,7 @@ define <8 x i16> @ssubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: ssubl2_4s: -;CHECK: ssubl2.4s +;CHECK: ssubl.4s %tmp1 = load <8 x i16>, <8 x i16>* %A %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> %ext1 = sext <4 x i16> %high1 to <4 x i32> @@ -187,7 +187,7 @@ define <4 x i32> @ssubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @ssubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: ssubl2_2d: -;CHECK: ssubl2.2d +;CHECK: ssubl.2d %tmp1 = load <4 x i32>, <4 x i32>* %A %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> %ext1 = sext <2 x i32> %high1 to <2 x i64> @@ -235,7 +235,7 @@ define <2 x i64> @usubl2d(<2 x i32>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: usubl2_8h: -;CHECK: usubl2.8h +;CHECK: usubl.8h %tmp1 = load <16 x i8>, <16 x i8>* %A %high1 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> %ext1 = zext <8 x i8> %high1 to <8 x i16> @@ -250,7 +250,7 @@ define <8 x i16> @usubl2_8h(<16 x i8>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: usubl2_4s: -;CHECK: usubl2.4s +;CHECK: usubl.4s %tmp1 = load <8 x i16>, <8 x i16>* %A %high1 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <4 x i32> %ext1 = zext <4 x i16> %high1 to <4 x i32> @@ -265,7 +265,7 @@ define <4 x i32> @usubl2_4s(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @usubl2_2d(<4 x i32>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: usubl2_2d: -;CHECK: usubl2.2d +;CHECK: usubl.2d %tmp1 = load <4 x i32>, <4 x i32>* %A %high1 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <2 x i32> %ext1 = zext <2 x i32> %high1 to <2 x i64> @@ -310,7 +310,7 @@ define <2 x i64> @ssubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: ssubw2_8h: -;CHECK: ssubw2.8h +;CHECK: ssubw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -323,7 +323,7 @@ define <8 x i16> @ssubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: ssubw2_4s: -;CHECK: ssubw2.4s +;CHECK: ssubw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -336,7 +336,7 @@ define <4 x i32> @ssubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @ssubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: ssubw2_2d: -;CHECK: ssubw2.2d +;CHECK: ssubw.2d %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = 
load <4 x i32>, <4 x i32>* %B @@ -379,7 +379,7 @@ define <2 x i64> @usubw2d(<2 x i64>* %A, <2 x i32>* %B) nounwind { define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { ;CHECK-LABEL: usubw2_8h: -;CHECK: usubw2.8h +;CHECK: usubw.8h %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <16 x i8>, <16 x i8>* %B @@ -392,7 +392,7 @@ define <8 x i16> @usubw2_8h(<8 x i16>* %A, <16 x i8>* %B) nounwind { define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { ;CHECK-LABEL: usubw2_4s: -;CHECK: usubw2.4s +;CHECK: usubw.4s %tmp1 = load <4 x i32>, <4 x i32>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B @@ -405,7 +405,7 @@ define <4 x i32> @usubw2_4s(<4 x i32>* %A, <8 x i16>* %B) nounwind { define <2 x i64> @usubw2_2d(<2 x i64>* %A, <4 x i32>* %B) nounwind { ;CHECK-LABEL: usubw2_2d: -;CHECK: usubw2.2d +;CHECK: usubw.2d %tmp1 = load <2 x i64>, <2 x i64>* %A %tmp2 = load <4 x i32>, <4 x i32>* %B diff --git a/test/CodeGen/AArch64/asm-print-comments.ll b/test/CodeGen/AArch64/asm-print-comments.ll new file mode 100644 index 000000000000..e997dce23583 --- /dev/null +++ b/test/CodeGen/AArch64/asm-print-comments.ll @@ -0,0 +1,17 @@ +; RUN: llc %s -mtriple=arm64-apple-darwin -o - | FileCheck %s + +; CHECK-LABEL: ; -- Begin function foo +; CHECK: foo: +define hidden i32 @foo() { + entry: + ret i32 30 +} +; CHECK: ; -- End function + +; CHECK-LABEL: ; -- Begin function bar +; CHECK: bar: +define i32 @bar() { + entry: + ret i32 30 +} +; CHECK: ; -- End function diff --git a/test/CodeGen/AArch64/cmpxchg-O0.ll b/test/CodeGen/AArch64/cmpxchg-O0.ll index 8432b15ea523..1bfbcf851c0e 100644 --- a/test/CodeGen/AArch64/cmpxchg-O0.ll +++ b/test/CodeGen/AArch64/cmpxchg-O0.ll @@ -3,10 +3,11 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_8: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxrb [[OLD:w[0-9]+]], [x0] ; CHECK: cmp [[OLD]], w1, uxtb ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxrb [[STATUS:w[3-9]]], w2, [x0] +; CHECK: stlxrb [[STATUS]], w2, [x0] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 @@ -18,6 +19,7 @@ define { i8, i1 } @test_cmpxchg_8(i8* %addr, i8 %desired, i8 %new) nounwind { define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_16: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxrh [[OLD:w[0-9]+]], [x0] ; CHECK: cmp [[OLD]], w1, uxth ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] @@ -33,10 +35,11 @@ define { i16, i1 } @test_cmpxchg_16(i16* %addr, i16 %desired, i16 %new) nounwind define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_32: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxr [[OLD:w[0-9]+]], [x0] ; CHECK: cmp [[OLD]], w1 ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxr [[STATUS:w[3-9]]], w2, [x0] +; CHECK: stlxr [[STATUS]], w2, [x0] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{w[0-9]+}}, [[OLD]], w1 @@ -48,10 +51,11 @@ define { i32, i1 } @test_cmpxchg_32(i32* %addr, i32 %desired, i32 %new) nounwind define { i64, i1 } @test_cmpxchg_64(i64* %addr, i64 %desired, i64 %new) nounwind { ; CHECK-LABEL: test_cmpxchg_64: ; CHECK: [[RETRY:.LBB[0-9]+_[0-9]+]]: +; CHECK: mov [[STATUS:w[3-9]+]], #0 ; CHECK: ldaxr [[OLD:x[0-9]+]], [x0] ; CHECK: cmp [[OLD]], x1 ; CHECK: b.ne [[DONE:.LBB[0-9]+_[0-9]+]] -; CHECK: stlxr 
[[STATUS:w[3-9]]], x2, [x0] +; CHECK: stlxr [[STATUS]], x2, [x0] ; CHECK: cbnz [[STATUS]], [[RETRY]] ; CHECK: [[DONE]]: ; CHECK: subs {{x[0-9]+}}, [[OLD]], x1 diff --git a/test/CodeGen/AArch64/fast-isel-cmpxchg.ll b/test/CodeGen/AArch64/fast-isel-cmpxchg.ll index aa78210fae74..7ef625abab20 100644 --- a/test/CodeGen/AArch64/fast-isel-cmpxchg.ll +++ b/test/CodeGen/AArch64/fast-isel-cmpxchg.ll @@ -2,11 +2,12 @@ ; CHECK-LABEL: cmpxchg_monotonic_32: ; CHECK: [[RETRY:.LBB[0-9_]+]]: +; CHECK-NEXT: mov [[STATUS:w[0-9]+]], #0 ; CHECK-NEXT: ldaxr [[OLD:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[OLD]], w1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // BB#2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], w2, [x0] +; CHECK-NEXT: stlxr [[STATUS]], w2, [x0] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp [[OLD]], w1 @@ -27,11 +28,12 @@ define i32 @cmpxchg_monotonic_32(i32* %p, i32 %cmp, i32 %new, i32* %ps) #0 { ; CHECK: // BB#0: ; CHECK: ldr [[NEW:w[0-9]+]], [x2] ; CHECK-NEXT: [[RETRY:.LBB[0-9_]+]]: +; CHECK-NEXT: mov [[STATUS:w[0-9]+]], #0 ; CHECK-NEXT: ldaxr [[OLD:w[0-9]+]], [x0] ; CHECK-NEXT: cmp [[OLD]], w1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // BB#2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], [[NEW]], [x0] +; CHECK-NEXT: stlxr [[STATUS]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp [[OLD]], w1 @@ -51,11 +53,12 @@ define i32 @cmpxchg_acq_rel_32_load(i32* %p, i32 %cmp, i32* %pnew, i32* %ps) #0 ; CHECK-LABEL: cmpxchg_seq_cst_64: ; CHECK: [[RETRY:.LBB[0-9_]+]]: +; CHECK-NEXT: mov [[STATUS:w[0-9]+]], #0 ; CHECK-NEXT: ldaxr [[OLD:x[0-9]+]], [x0] ; CHECK-NEXT: cmp [[OLD]], x1 ; CHECK-NEXT: b.ne [[DONE:.LBB[0-9_]+]] ; CHECK-NEXT: // BB#2: -; CHECK-NEXT: stlxr [[STATUS:w[0-9]+]], x2, [x0] +; CHECK-NEXT: stlxr [[STATUS]], x2, [x0] ; CHECK-NEXT: cbnz [[STATUS]], [[RETRY]] ; CHECK-NEXT: [[DONE]]: ; CHECK-NEXT: cmp [[OLD]], x1 diff --git a/test/CodeGen/AArch64/live-interval-analysis.mir b/test/CodeGen/AArch64/live-interval-analysis.mir index d44300973566..93dfcf507fff 100644 --- a/test/CodeGen/AArch64/live-interval-analysis.mir +++ b/test/CodeGen/AArch64/live-interval-analysis.mir @@ -6,7 +6,7 @@ --- # CHECK-LABEL: ********** INTERVALS ********** # W29 is reserved, so we should only see dead defs -# CHECK-DAG: W29 [0B,0d:{{[0-9]+}})[32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}}) +# CHECK-DAG: W29 [32r,32d:{{[0-9]+}})[64r,64d:{{[0-9]+}}) # For normal registers like x28 we should see the full intervals # CHECK-DAG: W28 [0B,16r:{{[0-9]+}})[32r,48r:{{[0-9]+}})[48r,48d:{{[0-9]+}}) # CHECK: # End machine code for function reserved_reg_liveness. 
@@ -14,7 +14,7 @@ name: reserved_reg_liveness tracksRegLiveness: true body: | bb.0: - liveins: %x28_fp + liveins: %x28 %6 : xseqpairsclass = COPY %x28_fp %x28_fp = COPY %6 %x28 = COPY %x28 diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index 4c682e594e66..1d8787212579 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,5 +1,5 @@ -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57 -; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57A72 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -72,55 +72,40 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesea: -; CHECKA57: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA57: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA57: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA57: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA57: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA57: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA57: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]] -; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] -; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] -; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] -; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] -; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] -; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] -; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKA57A72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKA57A72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKA57A72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKA57A72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKA57A72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKA57A72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKA57A72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc 
{{v[0-7].16b}}, [[VG]] +; CHECKA57A72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] + ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]] ; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}} ; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]] ; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKM1: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]] ; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]] ; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]] -; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VH]] } define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) { @@ -188,53 +173,65 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ret void ; CHECK-LABEL: aesda: -; CHECKA57: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA57: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA57: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA57: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA57: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA57: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA57: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]] -; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] -; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] -; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] -; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] -; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] -; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] -; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKA57A72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKA57A72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKA57A72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKA57A72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKA57A72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKA57A72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc 
{{v[0-7].16b}}, [[VF]] +; CHECKA57A72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKA57A72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA57A72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] + ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]] ; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}} ; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]] ; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKM1: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]] ; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]] ; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]] -; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} -; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +} + +define void @aes_load_store(<16 x i8> *%p1, <16 x i8> *%p2 , <16 x i8> *%p3) { +entry: + %x1 = alloca <16 x i8>, align 16 + %x2 = alloca <16 x i8>, align 16 + %x3 = alloca <16 x i8>, align 16 + %x4 = alloca <16 x i8>, align 16 + %x5 = alloca <16 x i8>, align 16 + %in1 = load <16 x i8>, <16 x i8>* %p1, align 16 + store <16 x i8> %in1, <16 x i8>* %x1, align 16 + %aese1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in1) #2 + store <16 x i8> %aese1, <16 x i8>* %x2, align 16 + %in2 = load <16 x i8>, <16 x i8>* %p2, align 16 + %aesmc1= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese1) #2 + store <16 x i8> %aesmc1, <16 x i8>* %x3, align 16 + %aese2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %in1, <16 x i8> %in2) #2 + store <16 x i8> %aese2, <16 x i8>* %x4, align 16 + %aesmc2= call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %aese2) #2 + store <16 x i8> %aesmc2, <16 x i8>* %x5, align 16 + ret void + +; CHECK-LABEL: aes_load_store: +; CHECK: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECK: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECK-NEXT: aesmc {{v[0-7].16b}}, [[VB]] } diff --git a/test/CodeGen/AArch64/optimize-imm.ll b/test/CodeGen/AArch64/optimize-imm.ll index a4725c65aa26..f960a3a95fc9 100644 --- a/test/CodeGen/AArch64/optimize-imm.ll +++ b/test/CodeGen/AArch64/optimize-imm.ll @@ -62,3 +62,22 @@ entry: %and = xor i32 %xor, 56 ret i32 %and } + +; Check that, when (and %t1, 129) is transformed to (and %t0, 0), +; (xor %arg, 129) doesn't get transformed to (xor %arg, 0). 
+; +; CHECK-LABEL: PR33100: +; CHECK: mov w[[R0:[0-9]+]], #129 +; CHECK: eor {{x[0-9]+}}, {{x[0-9]+}}, x[[R0]] + +define i64 @PR33100(i64 %arg) { +entry: + %alloca0 = alloca i64 + store i64 8, i64* %alloca0, align 4 + %t0 = load i64, i64* %alloca0, align 4 + %t1 = shl i64 %arg, %t0 + %and0 = and i64 %t1, 129 + %xor0 = xor i64 %arg, 129 + %t2 = add i64 %and0, %xor0 + ret i64 %t2 +} diff --git a/test/CodeGen/AArch64/scheduledag-constreg.mir b/test/CodeGen/AArch64/scheduledag-constreg.mir index 23c785504f01..6b83dc715e0a 100644 --- a/test/CodeGen/AArch64/scheduledag-constreg.mir +++ b/test/CodeGen/AArch64/scheduledag-constreg.mir @@ -1,4 +1,4 @@ -# RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=misched 2>&1 | FileCheck %s +# RUN: llc -o /dev/null %s -mtriple=aarch64-- -run-pass=machine-scheduler -enable-misched -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts --- | define void @func() { ret void } diff --git a/test/CodeGen/AArch64/tailcall_misched_graph.ll b/test/CodeGen/AArch64/tailcall_misched_graph.ll index 0e4eb2b5fad9..4fbd8944f032 100644 --- a/test/CodeGen/AArch64/tailcall_misched_graph.ll +++ b/test/CodeGen/AArch64/tailcall_misched_graph.ll @@ -1,4 +1,4 @@ -; RUN: llc -mcpu=cyclone -debug-only=misched < %s 2>&1 | FileCheck %s +; RUN: llc -mcpu=cyclone -debug-only=machine-scheduler < %s 2>&1 | FileCheck %s ; REQUIRES: asserts diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir index 8839ba8e0ab2..0557008ceb4f 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir @@ -5,6 +5,11 @@ entry: ret void } + + define void @test_fconstant() { + entry: + ret void + } ... --- @@ -18,3 +23,18 @@ body: | %0(s32) = G_CONSTANT i32 5 ... + +--- +name: test_fconstant +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_fconstant + ; CHECK: %0(s32) = G_FCONSTANT float 1.000000e+00 + ; CHECK: %1(s32) = G_FCONSTANT float 7.5 + + %0(s32) = G_FCONSTANT float 1.0 + %1(s32) = G_FCONSTANT float 7.5 +... 
diff --git a/test/CodeGen/AMDGPU/bfe-combine.ll b/test/CodeGen/AMDGPU/bfe-combine.ll new file mode 100644 index 000000000000..791b49f0e143 --- /dev/null +++ b/test/CodeGen/AMDGPU/bfe-combine.ll @@ -0,0 +1,41 @@ +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck --check-prefix=GCN --check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck --check-prefix=GCN --check-prefix=CI %s + +; GCN-LABEL: {{^}}bfe_combine8: +; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 8, 8 +; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], 2, v[[BFE]] +; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 6, v{{[0-9]+}} +; CI: v_and_b32_e32 v[[ADDRLO:[0-9]+]], 0x3fc, v[[SHR]] +; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] +; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @bfe_combine8(i32 addrspace(1)* nocapture %arg, i32 %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %idx = add i32 %x, %id + %srl = lshr i32 %idx, 8 + %and = and i32 %srl, 255 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +; GCN-LABEL: {{^}}bfe_combine16: +; VI: v_bfe_u32 v[[BFE:[0-9]+]], v{{[0-9]+}}, 16, 16 +; VI: v_lshlrev_b32_e32 v[[ADDRBASE:[0-9]+]], {{[^,]+}}, v[[BFE]] +; CI: v_lshrrev_b32_e32 v[[SHR:[0-9]+]], 1, v{{[0-9]+}} +; CI: v_and_b32_e32 v[[AND:[0-9]+]], 0x7fff8000, v[[SHR]] +; CI: v_lshl_b64 v{{\[}}[[ADDRLO:[0-9]+]]:{{[^\]+}}], v{{\[}}[[AND]]:{{[^\]+}}], 2 +; VI: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[ADDRBASE]] +; GCN: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @bfe_combine16(i32 addrspace(1)* nocapture %arg, i32 %x) { + %id = tail call i32 @llvm.amdgcn.workitem.id.x() #2 + %idx = add i32 %x, %id + %srl = lshr i32 %idx, 1 + %and = and i32 %srl, 2147450880 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %and + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/extload-align.ll b/test/CodeGen/AMDGPU/extload-align.ll index 4644800421d8..12cf27b918af 100644 --- a/test/CodeGen/AMDGPU/extload-align.ll +++ b/test/CodeGen/AMDGPU/extload-align.ll @@ -1,4 +1,4 @@ -; RUN: llc -debug-only=misched -march=amdgcn -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s +; RUN: llc -debug-only=machine-scheduler -march=amdgcn -verify-machineinstrs %s -o - 2>&1| FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC -check-prefix=DEBUG %s ; REQUIRES: asserts ; Verify that the extload generated from %eval has the default @@ -20,4 +20,4 @@ define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 { %eval = sext i16 %val to i32 store i32 %eval, i32* %out ret void -} \ No newline at end of file +} diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll index 6fa26cb38793..9441bf208829 100644 --- a/test/CodeGen/AMDGPU/kernel-args.ll +++ b/test/CodeGen/AMDGPU/kernel-args.ll @@ -1,8 +1,8 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC -; 
RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC -; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=redwood -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC +; RUN: llc < %s -march=r600 -mcpu=cayman -verify-machineinstrs | FileCheck %s --check-prefix=EG --check-prefix=FUNC ; FUNC-LABEL: {{^}}i8_arg: ; HSA-VI: kernarg_segment_alignment = 4 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll new file mode 100644 index 000000000000..22e15e216805 --- /dev/null +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getpc.ll @@ -0,0 +1,15 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s + +declare i64 @llvm.amdgcn.s.getpc() #0 + +; GCN-LABEL: {{^}}test_s_getpc: +; GCN: s_load_dwordx2 +; GCN-DAG: s_getpc_b64 s{{\[[0-9]+:[0-9]+\]}} +; GCN: buffer_store_dwordx2 +define amdgpu_kernel void @test_s_getpc(i64 addrspace(1)* %out) #0 { + %tmp = call i64 @llvm.amdgcn.s.getpc() #1 + store volatile i64 %tmp, i64 addrspace(1)* %out, align 8 + ret void +} + +attributes #0 = { nounwind readnone speculatable } diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll index 5dd2efdf6382..72fde04ba391 100644 --- a/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i16: ; GCN-NOHSA: buffer_load_ushort v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll index 6e56b9f9b6d6..bdfc3caf9d01 100644 --- a/test/CodeGen/AMDGPU/load-constant-i8.ll +++ b/test/CodeGen/AMDGPU/load-constant-i8.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}constant_load_i8: diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll index dcdd1a947cd4..e3415b9c47de 100644 --- a/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/test/CodeGen/AMDGPU/load-global-i16.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck 
-check-prefixes=GCN,GCN-HSA,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll index 71adf090532f..fc0cbf916b52 100644 --- a/test/CodeGen/AMDGPU/load-global-i8.ll +++ b/test/CodeGen/AMDGPU/load-global-i8.ll @@ -1,8 +1,8 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}global_load_i8: diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll index bbbb34e8d333..7de3f3b28c6d 100644 --- a/test/CodeGen/AMDGPU/load-local-i16.ll +++ b/test/CodeGen/AMDGPU/load-local-i16.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i16: ; GCN: ds_read_u16 v{{[0-9]+}} diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll index 731996ec6c45..16eb366a4b15 100644 --- a/test/CodeGen/AMDGPU/load-local-i8.ll +++ b/test/CodeGen/AMDGPU/load-local-i8.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}local_load_i8: diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll index e85a724c1567..60e43f8fb2a7 100644 --- a/test/CodeGen/AMDGPU/min.ll +++ b/test/CodeGen/AMDGPU/min.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn 
-mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}v_test_imin_sle_i32: ; GCN: v_min_i32_e32 diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll index a90f200f79e3..190d2b72ebaf 100644 --- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll +++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck %s +; RUN: llc -march=r600 -mcpu=redwood -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck %s ; ; CFG flattening should use parallel-and mode to generate branch conditions and ; then merge if-regions with the same bodies. diff --git a/test/CodeGen/AMDGPU/r600.bitcast.ll b/test/CodeGen/AMDGPU/r600.bitcast.ll index acf7a66a357f..67431e6a4825 100644 --- a/test/CodeGen/AMDGPU/r600.bitcast.ll +++ b/test/CodeGen/AMDGPU/r600.bitcast.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; This test just checks that the compiler doesn't crash. diff --git a/test/CodeGen/AMDGPU/schedule-regpressure.mir b/test/CodeGen/AMDGPU/schedule-regpressure.mir index c71de87eeece..3a20ec732e5b 100644 --- a/test/CodeGen/AMDGPU/schedule-regpressure.mir +++ b/test/CodeGen/AMDGPU/schedule-regpressure.mir @@ -1,4 +1,4 @@ -# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=misched 2>&1 | FileCheck %s +# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=machine-scheduler 2>&1 | FileCheck %s # REQUIRES: asserts # Check there is no SReg_32 pressure created by DS_* instructions because of M0 use diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll index add90e9c2f3a..f63719d62a84 100644 --- a/test/CodeGen/AMDGPU/setcc.ll +++ b/test/CodeGen/AMDGPU/setcc.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() nounwind readnone diff --git a/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll b/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll new file mode 100644 index 000000000000..1cdfec9fdb59 --- /dev/null +++ b/test/CodeGen/AMDGPU/shl-add-to-add-shl.ll @@ -0,0 +1,40 @@ +; RUN: llc -march=amdgcn -mcpu=fiji < %s | FileCheck %s + +; Check transformation shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) +; Only one shift if expected, GEP shall not produce a separate shift + +; CHECK-LABEL: {{^}}add_const_offset: +; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0 +; CHECK: v_add_i32_e32 v[[ADD:[0-9]+]], vcc, 0xc80, v[[SHL]] +; CHECK-NOT: v_lshl +; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], 
vcc, s{{[0-9]+}}, v[[ADD]] +; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @add_const_offset(i32 addrspace(1)* nocapture %arg) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %add = add i32 %id, 200 + %shl = shl i32 %add, 2 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +; CHECK-LABEL: {{^}}or_const_offset: +; CHECK: v_lshlrev_b32_e32 v[[SHL:[0-9]+]], 4, v0 +; CHECK: v_or_b32_e32 v[[OR:[0-9]+]], 0x1000, v[[SHL]] +; CHECK-NOT: v_lshl +; CHECK: v_add_i32_e32 v[[ADDRLO:[0-9]+]], vcc, s{{[0-9]+}}, v[[OR]] +; CHECK: load_dword v{{[0-9]+}}, v{{\[}}[[ADDRLO]]: +define amdgpu_kernel void @or_const_offset(i32 addrspace(1)* nocapture %arg) { +bb: + %id = tail call i32 @llvm.amdgcn.workitem.id.x() + %add = or i32 %id, 256 + %shl = shl i32 %add, 2 + %ptr = getelementptr inbounds i32, i32 addrspace(1)* %arg, i32 %shl + %val = load i32, i32 addrspace(1)* %ptr, align 4 + store i32 %val, i32 addrspace(1)* %arg, align 4 + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll index ff666cc3653b..edc313ee323b 100644 --- a/test/CodeGen/AMDGPU/shl.ll +++ b/test/CodeGen/AMDGPU/shl.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll index b4355b76016a..44cfdf6398ae 100644 --- a/test/CodeGen/AMDGPU/sra.ll +++ b/test/CodeGen/AMDGPU/sra.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() #0 diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll index 160e921fc075..f61e524ee2e5 100644 --- a/test/CodeGen/AMDGPU/store-global.ll +++ b/test/CodeGen/AMDGPU/store-global.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}store_i1: ; 
EG: MEM_RAT MSKOR diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll index ab73ada370ea..ce7656adc0b4 100644 --- a/test/CodeGen/AMDGPU/store-private.ll +++ b/test/CodeGen/AMDGPU/store-private.ll @@ -1,7 +1,7 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s ; FUNC-LABEL: {{^}}store_i1: ; EG: MOVA_INT diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll index f366029fdea2..e7655df15520 100644 --- a/test/CodeGen/AMDGPU/sub.ll +++ b/test/CodeGen/AMDGPU/sub.ll @@ -1,5 +1,5 @@ ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s declare i32 @llvm.r600.read.tidig.x() readnone diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll index 25a700a943d2..e25f2235993f 100644 --- a/test/CodeGen/AMDGPU/unknown-processor.ll +++ b/test/CodeGen/AMDGPU/unknown-processor.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s -; RUN: llc -march=r600 -mcpu=unknown < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s +; RUN: llc -march=amdgcn -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=GCN %s +; RUN: llc -march=r600 -mcpu=unknown -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR -check-prefix=R600 %s ; Should not crash when the processor is not recognized and the ; wavefront size feature not set. diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll index 03cf725601b7..a0aac8c1d9ba 100644 --- a/test/CodeGen/AMDGPU/vector-alloca.ll +++ b/test/CodeGen/AMDGPU/vector-alloca.ll @@ -138,3 +138,25 @@ entry: store float %tmp2, float addrspace(1)* %out ret void } + +; The pointer arguments in local address space should not affect promotion to vector. 
+ +; OPT-LABEL: @vector_read_with_local_arg( +; OPT: %0 = extractelement <4 x i32> , i32 %index +; OPT: store i32 %0, i32 addrspace(1)* %out, align 4 +define amdgpu_kernel void @vector_read_with_local_arg(i32 addrspace(3)* %stopper, i32 addrspace(1)* %out, i32 %index) { +entry: + %tmp = alloca [4 x i32] + %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0 + %y = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 1 + %z = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 2 + %w = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 3 + store i32 0, i32* %x + store i32 1, i32* %y + store i32 2, i32* %z + store i32 3, i32* %w + %tmp1 = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 %index + %tmp2 = load i32, i32* %tmp1 + store i32 %tmp2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index 1a0c7fd8e1d6..f4aba880ff76 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -620,6 +620,360 @@ entry: ret float %r } +declare arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32]) + +define arm_aapcscc [3 x i32] @test_tiny_int_arrays([2 x i32] %arr) { +; CHECK-LABEL: name: test_tiny_int_arrays +; CHECK: liveins: %r0, %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR:%[0-9]+]](s64) = COPY [[ARG_ARR2]] +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: BLX @tiny_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0, implicit-def %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[RES_ARR0:%[0-9]+]](s96) = IMPLICIT_DEF +; CHECK: [[RES_ARR1:%[0-9]+]](s96) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0 +; CHECK: [[RES_ARR2:%[0-9]+]](s96) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32 +; CHECK: [[RES_ARR3:%[0-9]+]](s96) = G_INSERT [[RES_ARR2]], [[R2]](s32), 64 +; CHECK: [[RES_ARR:%[0-9]+]](s96) = COPY [[RES_ARR3]] +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s96), 64 +; FIXME: This doesn't seem correct with regard to the AAPCS docs (which say +; that composite types larger than 4 bytes should be passed through memory), +; but it's what DAGISel does. We should fix it in the common code for both. 
+; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1, implicit %r2 +entry: + %r = notail call arm_aapcscc [3 x i32] @tiny_int_arrays_target([2 x i32] %arr) + ret [3 x i32] %r +} + +declare arm_aapcscc void @multiple_int_arrays_target([2 x i32], [2 x i32]) + +define arm_aapcscc void @test_multiple_int_arrays([2 x i32] %arr0, [2 x i32] %arr1) { +; CHECK-LABEL: name: test_multiple_int_arrays +; CHECK: liveins: %r0, %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[R3:%[0-9]+]](s32) = COPY %r3 +; CHECK: [[ARG_ARR0_0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[ARG_ARR0_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR0_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR0_1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR0:%[0-9]+]](s64) = COPY [[ARG_ARR0_2]] +; CHECK: [[ARG_ARR1_0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1_1:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_0]], [[R2]](s32), 0 +; CHECK: [[ARG_ARR1_2:%[0-9]+]](s64) = G_INSERT [[ARG_ARR1_1]], [[R3]](s32), 32 +; CHECK: [[ARG_ARR1:%[0-9]+]](s64) = COPY [[ARG_ARR1_2]] +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR0]](s64), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 0 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR1]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: %r3 = COPY [[R3]] +; CHECK: BLX @multiple_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3 +; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: BX_RET 14, _ +entry: + notail call arm_aapcscc void @multiple_int_arrays_target([2 x i32] %arr0, [2 x i32] %arr1) + ret void +} + +declare arm_aapcscc void @large_int_arrays_target([20 x i32]) + +define arm_aapcscc void @test_large_int_arrays([20 x i32] %arr) { +; CHECK-LABEL: name: test_large_int_arrays +; CHECK: fixedStack: +; The parameters live in separate stack locations, one for each element that +; doesn't fit in the registers. 
+; CHECK-DAG: id: [[FIRST_STACK_ID:[0-9]+]], offset: 0, size: 4 +; CHECK-DAG: id: [[LAST_STACK_ID:[-0]+]], offset: 60, size: 4 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK-DAG: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK-DAG: [[R3:%[0-9]+]](s32) = COPY %r3 +; CHECK: [[FIRST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[ARG_ARR0:%[0-9]+]](s640) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1:%[0-9]+]](s640) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR2:%[0-9]+]](s640) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR3:%[0-9]+]](s640) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64 +; CHECK: [[ARG_ARR4:%[0-9]+]](s640) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96 +; CHECK: [[ARG_ARR5:%[0-9]+]](s640) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128 +; CHECK: [[ARG_ARR6:%[0-9]+]](s640) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 608 +; CHECK: [[ARG_ARR:%[0-9]+]](s640) = COPY [[ARG_ARR6]] +; CHECK: ADJCALLSTACKDOWN 64, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 64 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 96 +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 128 +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s640), 608 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: %r3 = COPY [[R3]] +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_FIRST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FIRST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_FIRST_ELEMENT]](s32) +; CHECK: G_STORE [[FIRST_STACK_ELEMENT]](s32), [[FIRST_STACK_ARG_ADDR]]{{.*}}store 4 +; Match the second-to-last offset, so we can get the correct SP for the last element +; CHECK: G_CONSTANT i32 56 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 60 +; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32) +; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4 +; CHECK: BLX @large_int_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3 +; CHECK: ADJCALLSTACKUP 64, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: BX_RET 14, _ +entry: + notail call arm_aapcscc void @large_int_arrays_target([20 x i32] %arr) + ret void +} + +declare arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double]) + +define arm_aapcscc [2 x float] @test_fp_arrays_aapcs([3 x double] %arr) { +; CHECK-LABEL: name: test_fp_arrays_aapcs +; CHECK: fixedStack: +; CHECK: id: [[ARR2_ID:[0-9]+]], offset: 0, size: 8 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK: [[ARR0_0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[ARR0_1:%[0-9]+]](s32) = COPY %r1 +; LITTLE: [[ARR0:%[0-9]+]](s64) = G_SEQUENCE [[ARR0_0]](s32), 0, [[ARR0_1]](s32), 32 +; BIG: [[ARR0:%[0-9]+]](s64) = G_SEQUENCE [[ARR0_1]](s32), 0, 
[[ARR0_0]](s32), 32 +; CHECK: [[ARR1_0:%[0-9]+]](s32) = COPY %r2 +; CHECK: [[ARR1_1:%[0-9]+]](s32) = COPY %r3 +; LITTLE: [[ARR1:%[0-9]+]](s64) = G_SEQUENCE [[ARR1_0]](s32), 0, [[ARR1_1]](s32), 32 +; BIG: [[ARR1:%[0-9]+]](s64) = G_SEQUENCE [[ARR1_1]](s32), 0, [[ARR1_0]](s32), 32 +; CHECK: [[ARR2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[ARR2_ID]] +; CHECK: [[ARR2:%[0-9]+]](s64) = G_LOAD [[ARR2_FI]]{{.*}}load 8 from %fixed-stack.[[ARR2_ID]] +; CHECK: [[ARR_MERGED_0:%[0-9]+]](s192) = IMPLICIT_DEF +; CHECK: [[ARR_MERGED_1:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_0]], [[ARR0]](s64), 0 +; CHECK: [[ARR_MERGED_2:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_1]], [[ARR1]](s64), 64 +; CHECK: [[ARR_MERGED_3:%[0-9]+]](s192) = G_INSERT [[ARR_MERGED_2]], [[ARR2]](s64), 128 +; CHECK: [[ARR_MERGED:%[0-9]+]](s192) = COPY [[ARR_MERGED_3]] +; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[ARR0:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 0 +; CHECK: [[ARR1:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 64 +; CHECK: [[ARR2:%[0-9]+]](s64) = G_EXTRACT [[ARR_MERGED]](s192), 128 +; CHECK: [[ARR0_0:%[0-9]+]](s32) = G_EXTRACT [[ARR0]](s64), 0 +; CHECK: [[ARR0_1:%[0-9]+]](s32) = G_EXTRACT [[ARR0]](s64), 32 +; LITTLE: %r0 = COPY [[ARR0_0]](s32) +; LITTLE: %r1 = COPY [[ARR0_1]](s32) +; BIG: %r0 = COPY [[ARR0_1]](s32) +; BIG: %r1 = COPY [[ARR0_0]](s32) +; CHECK: [[ARR1_0:%[0-9]+]](s32) = G_EXTRACT [[ARR1]](s64), 0 +; CHECK: [[ARR1_1:%[0-9]+]](s32) = G_EXTRACT [[ARR1]](s64), 32 +; LITTLE: %r2 = COPY [[ARR1_0]](s32) +; LITTLE: %r3 = COPY [[ARR1_1]](s32) +; BIG: %r2 = COPY [[ARR1_1]](s32) +; BIG: %r3 = COPY [[ARR1_0]](s32) +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[ARR2_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[ARR2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[ARR2_OFFSET]](s32) +; CHECK: G_STORE [[ARR2]](s64), [[ARR2_ADDR]](p0){{.*}}store 8 +; CHECK: BLX @fp_arrays_aapcs_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[R_MERGED_0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[R_MERGED_1:%[0-9]+]](s64) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0 +; CHECK: [[R_MERGED_2:%[0-9]+]](s64) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32 +; CHECK: [[R_MERGED:%[0-9]+]](s64) = COPY [[R_MERGED_2]] +; CHECK: ADJCALLSTACKUP 8, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %r = notail call arm_aapcscc [2 x float] @fp_arrays_aapcs_target([3 x double] %arr) + ret [2 x float] %r +} + +declare arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double], [3 x float], [4 x double]) + +define arm_aapcs_vfpcc [4 x float] @test_fp_arrays_aapcs_vfp([3 x double] %x, [3 x float] %y, [4 x double] %z) { +; CHECK-LABEL: name: test_fp_arrays_aapcs_vfp +; CHECK: fixedStack: +; CHECK-DAG: id: [[Z0_ID:[0-9]+]], offset: 0, size: 8 +; CHECK-DAG: id: [[Z1_ID:[0-9]+]], offset: 8, size: 8 +; CHECK-DAG: id: [[Z2_ID:[0-9]+]], offset: 16, size: 8 +; CHECK-DAG: id: [[Z3_ID:[0-9]+]], offset: 24, size: 8 +; CHECK: liveins: %d0, %d1, %d2, %s6, %s7, %s8 +; CHECK: [[X0:%[0-9]+]](s64) = COPY %d0 +; CHECK: [[X1:%[0-9]+]](s64) = COPY %d1 +; CHECK: [[X2:%[0-9]+]](s64) = COPY %d2 +; CHECK: [[Y0:%[0-9]+]](s32) 
= COPY %s6 +; CHECK: [[Y1:%[0-9]+]](s32) = COPY %s7 +; CHECK: [[Y2:%[0-9]+]](s32) = COPY %s8 +; CHECK: [[Z0_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z0_ID]] +; CHECK: [[Z0:%[0-9]+]](s64) = G_LOAD [[Z0_FI]]{{.*}}load 8 +; CHECK: [[Z1_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z1_ID]] +; CHECK: [[Z1:%[0-9]+]](s64) = G_LOAD [[Z1_FI]]{{.*}}load 8 +; CHECK: [[Z2_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z2_ID]] +; CHECK: [[Z2:%[0-9]+]](s64) = G_LOAD [[Z2_FI]]{{.*}}load 8 +; CHECK: [[Z3_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Z3_ID]] +; CHECK: [[Z3:%[0-9]+]](s64) = G_LOAD [[Z3_FI]]{{.*}}load 8 +; CHECK: [[X_ARR_0:%[0-9]+]](s192) = IMPLICIT_DEF +; CHECK: [[X_ARR_1:%[0-9]+]](s192) = G_INSERT [[X_ARR_0]], [[X0]](s64), 0 +; CHECK: [[X_ARR_2:%[0-9]+]](s192) = G_INSERT [[X_ARR_1]], [[X1]](s64), 64 +; CHECK: [[X_ARR_3:%[0-9]+]](s192) = G_INSERT [[X_ARR_2]], [[X2]](s64), 128 +; CHECK: [[X_ARR:%[0-9]+]](s192) = COPY [[X_ARR_3]](s192) +; CHECK: [[Y_ARR_0:%[0-9]+]](s96) = IMPLICIT_DEF +; CHECK: [[Y_ARR_1:%[0-9]+]](s96) = G_INSERT [[Y_ARR_0]], [[Y0]](s32), 0 +; CHECK: [[Y_ARR_2:%[0-9]+]](s96) = G_INSERT [[Y_ARR_1]], [[Y1]](s32), 32 +; CHECK: [[Y_ARR_3:%[0-9]+]](s96) = G_INSERT [[Y_ARR_2]], [[Y2]](s32), 64 +; CHECK: [[Y_ARR:%[0-9]+]](s96) = COPY [[Y_ARR_3]](s96) +; CHECK: [[Z_ARR_0:%[0-9]+]](s256) = IMPLICIT_DEF +; CHECK: [[Z_ARR_1:%[0-9]+]](s256) = G_INSERT [[Z_ARR_0]], [[Z0]](s64), 0 +; CHECK: [[Z_ARR_2:%[0-9]+]](s256) = G_INSERT [[Z_ARR_1]], [[Z1]](s64), 64 +; CHECK: [[Z_ARR_3:%[0-9]+]](s256) = G_INSERT [[Z_ARR_2]], [[Z2]](s64), 128 +; CHECK: [[Z_ARR_4:%[0-9]+]](s256) = G_INSERT [[Z_ARR_3]], [[Z3]](s64), 192 +; CHECK: [[Z_ARR:%[0-9]+]](s256) = COPY [[Z_ARR_4]](s256) +; CHECK: ADJCALLSTACKDOWN 32, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[X0:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 0 +; CHECK: [[X1:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 64 +; CHECK: [[X2:%[0-9]+]](s64) = G_EXTRACT [[X_ARR]](s192), 128 +; CHECK: [[Y0:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 0 +; CHECK: [[Y1:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 32 +; CHECK: [[Y2:%[0-9]+]](s32) = G_EXTRACT [[Y_ARR]](s96), 64 +; CHECK: [[Z0:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 0 +; CHECK: [[Z1:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 64 +; CHECK: [[Z2:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 128 +; CHECK: [[Z3:%[0-9]+]](s64) = G_EXTRACT [[Z_ARR]](s256), 192 +; CHECK: %d0 = COPY [[X0]](s64) +; CHECK: %d1 = COPY [[X1]](s64) +; CHECK: %d2 = COPY [[X2]](s64) +; CHECK: %s6 = COPY [[Y0]](s32) +; CHECK: %s7 = COPY [[Y1]](s32) +; CHECK: %s8 = COPY [[Y2]](s32) +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z0_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[Z0_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z0_OFFSET]](s32) +; CHECK: G_STORE [[Z0]](s64), [[Z0_ADDR]](p0){{.*}}store 8 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z1_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 8 +; CHECK: [[Z1_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z1_OFFSET]](s32) +; CHECK: G_STORE [[Z1]](s64), [[Z1_ADDR]](p0){{.*}}store 8 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z2_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 16 +; CHECK: [[Z2_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z2_OFFSET]](s32) +; CHECK: G_STORE [[Z2]](s64), [[Z2_ADDR]](p0){{.*}}store 8 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[Z3_OFFSET:%[0-9]+]](s32) = G_CONSTANT i32 24 +; CHECK: [[Z3_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[Z3_OFFSET]](s32) +; CHECK: G_STORE [[Z3]](s64), [[Z3_ADDR]](p0){{.*}}store 8 +; CHECK: BLX @fp_arrays_aapcs_vfp_target, csr_aapcs, 
implicit-def %lr, implicit %sp, implicit %d0, implicit %d1, implicit %d2, implicit %s6, implicit %s7, implicit %s8, implicit-def %s0, implicit-def %s1, implicit-def %s2, implicit-def %s3 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %s0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %s1 +; CHECK: [[R2:%[0-9]+]](s32) = COPY %s2 +; CHECK: [[R3:%[0-9]+]](s32) = COPY %s3 +; CHECK: [[R_MERGED_0:%[0-9]+]](s128) = IMPLICIT_DEF +; CHECK: [[R_MERGED_1:%[0-9]+]](s128) = G_INSERT [[R_MERGED_0]], [[R0]](s32), 0 +; CHECK: [[R_MERGED_2:%[0-9]+]](s128) = G_INSERT [[R_MERGED_1]], [[R1]](s32), 32 +; CHECK: [[R_MERGED_3:%[0-9]+]](s128) = G_INSERT [[R_MERGED_2]], [[R2]](s32), 64 +; CHECK: [[R_MERGED_4:%[0-9]+]](s128) = G_INSERT [[R_MERGED_3]], [[R3]](s32), 96 +; CHECK: [[R_MERGED:%[0-9]+]](s128) = COPY [[R_MERGED_4]] +; CHECK: ADJCALLSTACKUP 32, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 64 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[R_MERGED]](s128), 96 +; CHECK: %s0 = COPY [[R0]] +; CHECK: %s1 = COPY [[R1]] +; CHECK: %s2 = COPY [[R2]] +; CHECK: %s3 = COPY [[R3]] +; CHECK: BX_RET 14, _, implicit %s0, implicit %s1, implicit %s2, implicit %s3 +entry: + %r = notail call arm_aapcs_vfpcc [4 x float] @fp_arrays_aapcs_vfp_target([3 x double] %x, [3 x float] %y, [4 x double] %z) + ret [4 x float] %r +} + +declare arm_aapcscc [2 x i32*] @tough_arrays_target([6 x [4 x i32]] %arr) + +define arm_aapcscc [2 x i32*] @test_tough_arrays([6 x [4 x i32]] %arr) { +; CHECK-LABEL: name: test_tough_arrays +; CHECK: fixedStack: +; The parameters live in separate stack locations, one for each element that +; doesn't fit in the registers. 
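+; Here the [6 x [4 x i32]] argument flattens to 24 i32 values: the first four
+; are passed in r0-r3 and the remaining 20 get 4-byte stack slots at offsets 0
+; through 76, so the call below reserves 80 bytes with ADJCALLSTACKDOWN.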
+; CHECK-DAG: id: [[FIRST_STACK_ID:[0-9]+]], offset: 0, size: 4 +; CHECK-DAG: id: [[LAST_STACK_ID:[-0]+]], offset: 76, size: 4 +; CHECK: liveins: %r0, %r1, %r2, %r3 +; CHECK-DAG: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r2 +; CHECK-DAG: [[R3:%[0-9]+]](s32) = COPY %r3 +; CHECK: [[FIRST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[FIRST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[FIRST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT_FI:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_LOAD [[LAST_STACK_ELEMENT_FI]]{{.*}}load 4 from %fixed-stack.[[LAST_STACK_ID]] +; CHECK: [[ARG_ARR0:%[0-9]+]](s768) = IMPLICIT_DEF +; CHECK: [[ARG_ARR1:%[0-9]+]](s768) = G_INSERT [[ARG_ARR0]], [[R0]](s32), 0 +; CHECK: [[ARG_ARR2:%[0-9]+]](s768) = G_INSERT [[ARG_ARR1]], [[R1]](s32), 32 +; CHECK: [[ARG_ARR3:%[0-9]+]](s768) = G_INSERT [[ARG_ARR2]], [[R2]](s32), 64 +; CHECK: [[ARG_ARR4:%[0-9]+]](s768) = G_INSERT [[ARG_ARR3]], [[R3]](s32), 96 +; CHECK: [[ARG_ARR5:%[0-9]+]](s768) = G_INSERT [[ARG_ARR4]], [[FIRST_STACK_ELEMENT]](s32), 128 +; CHECK: [[ARG_ARR6:%[0-9]+]](s768) = G_INSERT {{%[0-9]+}}, [[LAST_STACK_ELEMENT]](s32), 736 +; CHECK: [[ARG_ARR:%[0-9]+]](s768) = COPY [[ARG_ARR6]] +; CHECK: ADJCALLSTACKDOWN 80, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 32 +; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 64 +; CHECK: [[R3:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 96 +; CHECK: [[FIRST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 128 +; CHECK: [[LAST_STACK_ELEMENT:%[0-9]+]](s32) = G_EXTRACT [[ARG_ARR]](s768), 736 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: %r2 = COPY [[R2]] +; CHECK: %r3 = COPY [[R3]] +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_FIRST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 0 +; CHECK: [[FIRST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_FIRST_ELEMENT]](s32) +; CHECK: G_STORE [[FIRST_STACK_ELEMENT]](s32), [[FIRST_STACK_ARG_ADDR]]{{.*}}store 4 +; Match the second-to-last offset, so we can get the correct SP for the last element +; CHECK: G_CONSTANT i32 72 +; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp +; CHECK: [[OFF_LAST_ELEMENT:%[0-9]+]](s32) = G_CONSTANT i32 76 +; CHECK: [[LAST_STACK_ARG_ADDR:%[0-9]+]](p0) = G_GEP [[SP]], [[OFF_LAST_ELEMENT]](s32) +; CHECK: G_STORE [[LAST_STACK_ELEMENT]](s32), [[LAST_STACK_ARG_ADDR]]{{.*}}store 4 +; CHECK: BLX @tough_arrays_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1 +; CHECK: [[R0:%[0-9]+]](s32) = COPY %r0 +; CHECK: [[R1:%[0-9]+]](s32) = COPY %r1 +; CHECK: [[RES_ARR0:%[0-9]+]](s64) = IMPLICIT_DEF +; CHECK: [[RES_ARR1:%[0-9]+]](s64) = G_INSERT [[RES_ARR0]], [[R0]](s32), 0 +; CHECK: [[RES_ARR2:%[0-9]+]](s64) = G_INSERT [[RES_ARR1]], [[R1]](s32), 32 +; CHECK: [[RES_ARR:%[0-9]+]](s64) = COPY [[RES_ARR2]] +; CHECK: ADJCALLSTACKUP 80, 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: [[R0:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 0 +; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RES_ARR]](s64), 32 +; CHECK: %r0 = COPY [[R0]] +; CHECK: %r1 = COPY [[R1]] +; CHECK: BX_RET 14, _, implicit %r0, implicit %r1 +entry: + %r = notail call arm_aapcscc [2 x i32*] 
@tough_arrays_target([6 x [4 x i32]] %arr) + ret [2 x i32*] %r +} + define i32 @test_shufflevector_s32_v2s32(i32 %arg) { ; CHECK-LABEL: name: test_shufflevector_s32_v2s32 ; CHECK: [[ARG:%[0-9]+]](s32) = COPY %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll index e3680ed2b929..ef30cb1063f8 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll +++ b/test/CodeGen/ARM/GlobalISel/arm-unsupported.ll @@ -45,11 +45,13 @@ define half @test_half(half %a, half %b) { ret half %res } -; On ARM, clang lowers structs to arrays. -define void @test_arrays([2 x i32] %this.could.come.from.a.struct) { -; CHECK: remark: {{.*}} unable to lower arguments: void ([2 x i32])* -; CHECK-LABEL: warning: Instruction selection used fallback path for test_arrays - ret void +declare [16 x i32] @ret_demotion_target() + +define [16 x i32] @test_ret_demotion() { +; CHECK: remark: {{.*}} unable to translate instruction: call{{.*}} @ret_demotion_target +; CHECK-LABEL: warning: Instruction selection used fallback path for test_ret_demotion + %res = call [16 x i32] @ret_demotion_target() + ret [16 x i32] %res } define void @test_structs({i32, i32} %struct) { diff --git a/test/CodeGen/ARM/arm-shrink-wrapping.ll b/test/CodeGen/ARM/arm-shrink-wrapping.ll index 9cce19417047..1985ff9b4a27 100644 --- a/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -656,6 +656,9 @@ declare double @llvm.pow.f64(double, double) ; ; DISABLE: pop ; +; FIXME: This is flaky: it passes by finding 'bl' somewhere amongst the debug +; info (like labels named 'line_table'), not because it has found a bl instruction. +; ; CHECK: bl define float @debug_info(float %gamma, float %slopeLimit, i1 %or.cond, double %tmp) "no-frame-pointer-elim"="true" { bb: @@ -681,7 +684,9 @@ bb13: ; preds = %bb3, %bb !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!3} -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "LLVM", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !4, globals: !2, imports: !2) !1 = !DIFile(filename: "a.cpp", directory: "b") !2 = !{} !3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{!5} +!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/test/CodeGen/ARM/constantpool-promote-dbg.ll b/test/CodeGen/ARM/constantpool-promote-dbg.ll index ae765d26dcac..84386d2975f0 100644 --- a/test/CodeGen/ARM/constantpool-promote-dbg.ll +++ b/test/CodeGen/ARM/constantpool-promote-dbg.ll @@ -1,4 +1,4 @@ -; RUN: llc -relocation-model=static < %s | FileCheck %s +; RUN: llc -relocation-model=static -arm-promote-constant < %s | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv7m--linux-gnu" diff --git a/test/CodeGen/ARM/constantpool-promote-ldrh.ll b/test/CodeGen/ARM/constantpool-promote-ldrh.ll index 9e369dc08c4b..59970495874b 100644 --- a/test/CodeGen/ARM/constantpool-promote-ldrh.ll +++ b/test/CodeGen/ARM/constantpool-promote-ldrh.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -O0 -fast-isel=false | FileCheck %s -; RUN: llc < %s -O0 -fast-isel=false -filetype=obj +; RUN: llc < %s -O0 -fast-isel=false -arm-promote-constant | FileCheck %s +; RUN: llc < %s -O0 -fast-isel=false -filetype=obj -arm-promote-constant target datalayout = 
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv6m-arm-linux-gnueabi" diff --git a/test/CodeGen/ARM/constantpool-promote.ll b/test/CodeGen/ARM/constantpool-promote.ll index 8df7e100c051..d5361f33a98b 100644 --- a/test/CodeGen/ARM/constantpool-promote.ll +++ b/test/CodeGen/ARM/constantpool-promote.ll @@ -1,15 +1,15 @@ -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M -; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi -arm-promote-constant < %s | FileCheck %s 
--check-prefixes=CHECK,CHECK-V6M +; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi -arm-promote-constant < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M @.str = private unnamed_addr constant [2 x i8] c"s\00", align 1 @.str1 = private unnamed_addr constant [69 x i8] c"this string is far too long to fit in a literal pool by far and away\00", align 1 diff --git a/test/CodeGen/ARM/cortexr52-misched-basic.ll b/test/CodeGen/ARM/cortexr52-misched-basic.ll index 3ccb34d9fc90..eb2c29a3a5d1 100644 --- a/test/CodeGen/ARM/cortexr52-misched-basic.ll +++ b/test/CodeGen/ARM/cortexr52-misched-basic.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=R52_SCHED -; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=R52_SCHED +; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=generic -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=GENERIC ; ; Check the latency for instructions for both generic and cortex-r52. ; Cortex-r52 machine model will cause the div to be sceduled before eor diff --git a/test/CodeGen/ARM/fastisel-thumb-litpool.ll b/test/CodeGen/ARM/fastisel-thumb-litpool.ll index aa9e7260fb2e..53653a5a4f57 100644 --- a/test/CodeGen/ARM/fastisel-thumb-litpool.ll +++ b/test/CodeGen/ARM/fastisel-thumb-litpool.ll @@ -5,6 +5,7 @@ ; hence the CHECK-NOT. define i32 @test_thumb_ldrlit() minsize { +; CHECK-LABEL: test_thumb_ldrlit: ; CHECK: ldr r0, LCPI0_0 ; CHECK-NOT: ldr ret i32 12345678 diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index fb204debf612..b447497b270a 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -35,6 +35,8 @@ entry: ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0], [[INC]] ; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0] +; CHECK-T1-LABEL: t1: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([31 x i8], [31 x i8]* @.str1, i64 0, i64 0), i64 31, i32 1, i1 false) ret void } @@ -51,6 +53,8 @@ entry: ; CHECK: str [[REG2]], [r0] ; CHECK: vld1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r3] +; CHECK-T1-LABEL: t2: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([36 x i8], [36 x i8]* @.str2, i64 0, i64 0), i64 36, i32 1, i1 false) ret void } @@ -62,6 +66,8 @@ entry: ; CHECK: vst1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0]! ; CHECK: vldr d{{[0-9]+}}, [r1] ; CHECK: vst1.8 {d{{[0-9]+}}}, [r0] +; CHECK-T1-LABEL: t3: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([24 x i8], [24 x i8]* @.str3, i64 0, i64 0), i64 24, i32 1, i1 false) ret void } @@ -72,6 +78,8 @@ entry: ; CHECK: vld1.64 {[[REG3:d[0-9]+]], [[REG4:d[0-9]+]]}, [r1] ; CHECK: vst1.8 {[[REG3]], [[REG4]]}, [r0]! 
; CHECK: strh [[REG5:r[0-9]+]], [r0] +; CHECK-T1-LABEL: t4: +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([18 x i8], [18 x i8]* @.str4, i64 0, i64 0), i64 18, i32 1, i1 false) ret void } @@ -87,10 +95,7 @@ entry: ; CHECK: movt [[REG7:r[0-9]+]], #22866 ; CHECK: str [[REG7]] ; CHECK-T1-LABEL: t5: -; CHECK-T1: movs [[TREG3:r[0-9]]], -; CHECK-T1: strb [[TREG3]], -; CHECK-T1: movs [[TREG4:r[0-9]]], -; CHECK-T1: strb [[TREG4]], +; CHECK-T1: bl _memcpy tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false) ret void } diff --git a/test/CodeGen/ARM/memset-inline.ll b/test/CodeGen/ARM/memset-inline.ll index b86874692aca..b2bd257701d3 100644 --- a/test/CodeGen/ARM/memset-inline.ll +++ b/test/CodeGen/ARM/memset-inline.ll @@ -1,22 +1,36 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s -check-prefix=CHECK-7A +; RUN: llc < %s -mtriple=thumbv6m -pre-RA-sched=source -disable-post-ra -mattr=+strict-align | FileCheck %s -check-prefix=CHECK-6M define void @t1(i8* nocapture %c) nounwind optsize { entry: -; CHECK-LABEL: t1: -; CHECK: movs r1, #0 -; CHECK: strd r1, r1, [r0] -; CHECK: str r1, [r0, #8] +; CHECK-7A-LABEL: t1: +; CHECK-7A: movs r1, #0 +; CHECK-7A: strd r1, r1, [r0] +; CHECK-7A: str r1, [r0, #8] +; CHECK-6M-LABEL: t1: +; CHECK-6M: movs r1, #0 +; CHECK-6M: str r1, [r0] +; CHECK-6M: str r1, [r0, #4] +; CHECK-6M: str r1, [r0, #8] call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false) ret void } define void @t2() nounwind ssp { entry: -; CHECK-LABEL: t2: -; CHECK: vmov.i32 {{q[0-9]+}}, #0x0 -; CHECK: movs r1, #10 -; CHECK: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1 -; CHECK: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] +; CHECK-7A-LABEL: t2: +; CHECK-7A: vmov.i32 {{q[0-9]+}}, #0x0 +; CHECK-7A: movs r1, #10 +; CHECK-7A: vst1.64 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2], r1 +; CHECK-7A: vst1.16 {d{{[0-9]+}}, d{{[0-9]+}}}, [r2] +; CHECK-6M-LABEL: t2: +; CHECK-6M: movs [[REG:r[0-9]+]], #0 +; CHECK-6M: str [[REG]], [sp, #20] +; CHECK-6M: str [[REG]], [sp, #16] +; CHECK-6M: str [[REG]], [sp, #12] +; CHECK-6M: str [[REG]], [sp, #8] +; CHECK-6M: str [[REG]], [sp, #4] +; CHECK-6M: str [[REG]], [sp] %buf = alloca [26 x i8], align 1 %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0 call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false) @@ -24,6 +38,56 @@ entry: ret void } +define void @t3(i8* %p) { +entry: +; CHECK-7A-LABEL: t3: +; CHECK-7A: muls [[REG:r[0-9]+]], +; CHECK-7A: str [[REG]], +; CHECK-6M-LABEL: t3: +; CHECK-6M-NOT: muls +; CHECK-6M: strb [[REG:r[0-9]+]], +; CHECK-6M: strb [[REG]], +; CHECK-6M: strb [[REG]], +; CHECK-6M: strb [[REG]], + br label %for.body + +for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = trunc i32 %i to i8 + call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false) + call void @something(i8* %p) + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 255 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + +define void @t4(i8* %p) { +entry: +; CHECK-7A-LABEL: t4: +; CHECK-7A: muls [[REG:r[0-9]+]], +; CHECK-7A: str [[REG]], +; CHECK-6M-LABEL: t4: +; CHECK-6M: muls [[REG:r[0-9]+]], +; CHECK-6M: strh [[REG]], +; CHECK-6M: strh [[REG]], + br label %for.body + 
+for.body: + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = trunc i32 %i to i8 + call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false) + call void @something(i8* %p) + %inc = add nuw nsw i32 %i, 1 + %exitcond = icmp eq i32 %inc, 255 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} + declare void @something(i8*) nounwind declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM/misched-copy-arm.ll b/test/CodeGen/ARM/misched-copy-arm.ll index 330252a90d7c..53f8b8d15042 100644 --- a/test/CodeGen/ARM/misched-copy-arm.ll +++ b/test/CodeGen/ARM/misched-copy-arm.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=misched -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s +; RUN: llc -mtriple=thumb-eabi -mcpu=swift -pre-RA-sched=source -join-globalcopies -enable-misched -verify-misched -debug-only=machine-scheduler -arm-atomic-cfg-tidy=0 %s -o - 2>&1 | FileCheck %s ; ; Loop counter copies should be eliminated. ; There is also a MUL here, but we don't care where it is scheduled. diff --git a/test/CodeGen/ARM/misched-fp-basic.ll b/test/CodeGen/ARM/misched-fp-basic.ll index 27ad2cec34fd..2f672b0cb540 100644 --- a/test/CodeGen/ARM/misched-fp-basic.ll +++ b/test/CodeGen/ARM/misched-fp-basic.ll @@ -1,9 +1,9 @@ ; REQUIRES: asserts -; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a9 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \ ; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 -; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=swift -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \ ; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT -; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > \ +; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-r52 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > \ ; RUN: /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 ; ; Check the latency of instructions for processors with sched-models diff --git a/test/CodeGen/ARM/misched-int-basic-thumb2.mir b/test/CodeGen/ARM/misched-int-basic-thumb2.mir index 86ef1e26f636..32d1e03d9a1b 100644 --- a/test/CodeGen/ARM/misched-int-basic-thumb2.mir +++ b/test/CodeGen/ARM/misched-int-basic-thumb2.mir @@ -1,10 +1,10 @@ # Basic machine sched model test for Thumb2 int instructions # RUN: llc -o /dev/null %s -mtriple=thumbv7-eabi -mcpu=swift -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT # RUN: llc -o /dev/null %s -mtriple=thumbv7--eabi -mcpu=cortex-a9 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 # RUN: llc -o 
/dev/null %s -mtriple=thumbv8r-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/test/CodeGen/ARM/misched-int-basic.mir b/test/CodeGen/ARM/misched-int-basic.mir index f237c0a07b2e..d5231269d732 100644 --- a/test/CodeGen/ARM/misched-int-basic.mir +++ b/test/CodeGen/ARM/misched-int-basic.mir @@ -1,9 +1,9 @@ # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=swift -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-a9 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9 # RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -verify-misched \ -# RUN: -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 +# RUN: -debug-only=machine-scheduler 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52 # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/test/CodeGen/ARM/single-issue-r52.mir b/test/CodeGen/ARM/single-issue-r52.mir index 6c95f7603e6e..1eba074dafb3 100644 --- a/test/CodeGen/ARM/single-issue-r52.mir +++ b/test/CodeGen/ARM/single-issue-r52.mir @@ -1,5 +1,5 @@ -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN -# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN +# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass machine-scheduler -enable-misched -debug-only=machine-scheduler -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP # REQUIRES: asserts --- | ; ModuleID = 'foo.ll' diff --git a/test/CodeGen/ARM/vcombine.ll b/test/CodeGen/ARM/vcombine.ll index 81b22ee12cdd..c08ed81d042a 100644 --- a/test/CodeGen/ARM/vcombine.ll +++ b/test/CodeGen/ARM/vcombine.ll @@ -99,7 +99,9 @@ define <4 x i16> @vget_low16(<8 x i16>* %A) nounwind { define <8 x i8> @vget_high8(<16 x i8>* %A) nounwind { ; CHECK: vget_high8 ; CHECK-NOT: vst -; CHECK-LE: vmov r0, r1, d17 +; CHECK-LE-NOT: vld1.64 {d16, d17}, [r0] +; CHECK-LE: vldr d16, [r0, #8] +; CHECK-LE: vmov r0, r1, d16 ; CHECK-BE: vmov r1, r0, d16 %tmp1 = load <16 x i8>, <16 x i8>* %A %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <8 x i32> diff --git a/test/CodeGen/ARM/vext.ll b/test/CodeGen/ARM/vext.ll index e44e757a3169..5742dc314978 100644 --- a/test/CodeGen/ARM/vext.ll +++ b/test/CodeGen/ARM/vext.ll @@ -199,10 +199,10 @@ 
define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind { define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { ; CHECK-LABEL: test_undef: ; CHECK: @ BB#0: -; CHECK-NEXT: vld1.64 {d16, d17}, [r1] -; CHECK-NEXT: vld1.64 {d18, d19}, [r0] -; CHECK-NEXT: vzip.16 d19, d16 -; CHECK-NEXT: vmov r0, r1, d19 +; CHECK-NEXT: vldr d16, [r1] +; CHECK-NEXT: vldr d17, [r0, #8] +; CHECK-NEXT: vzip.16 d17, d16 +; CHECK-NEXT: vmov r0, r1, d17 ; CHECK-NEXT: mov pc, lr %tmp1 = load <8 x i16>, <8 x i16>* %A %tmp2 = load <8 x i16>, <8 x i16>* %B diff --git a/test/CodeGen/Hexagon/post-ra-kill-update.mir b/test/CodeGen/Hexagon/post-ra-kill-update.mir index c43624d7a8d3..ac46a70a68a5 100644 --- a/test/CodeGen/Hexagon/post-ra-kill-update.mir +++ b/test/CodeGen/Hexagon/post-ra-kill-update.mir @@ -6,7 +6,7 @@ # CHECK-LABEL: name: foo # Check for no-kill of r9 in the first instruction, after reordering: -# CHECK: %d7 = S2_lsr_r_p_or %d7, killed %d1, %r9 +# CHECK: %d7 = S2_lsr_r_p_or killed %d7, killed %d1, %r9 # CHECK: %d13 = S2_lsr_r_p killed %d0, killed %r9 --- | diff --git a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll index 18cca5c356e3..242ee53f19f2 100644 --- a/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll +++ b/test/CodeGen/Lanai/lanai-misched-trivial-disjoint.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=misched -o /dev/null 2>&1 | FileCheck %s +; RUN: llc %s -mtriple=lanai-unknown-unknown -debug-only=machine-scheduler -o /dev/null 2>&1 | FileCheck %s ; Make sure there are no control dependencies between memory operations that ; are trivially disjoint. diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir deleted file mode 100644 index 96801f5b0a37..000000000000 --- a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir +++ /dev/null @@ -1,24 +0,0 @@ -# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s ---- -name: foo -body: | - bb.0: - B %bb.2 - - bb.1: - BX_RET 14, 0 - - bb.2: - Bcc %bb.1, 1, %cpsr - - bb.3: - B %bb.1 - -... 
- -# We should get a single block containing the BX_RET, with no successors at all - -# CHECK: body: -# CHECK-NEXT: bb.0: -# CHECK-NEXT: BX_RET - diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll index b23f1ad37d81..87b6a7aeacf5 100644 --- a/test/CodeGen/MSP430/hwmult16.ll +++ b/test/CodeGen/MSP430/hwmult16.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s +; RUN: llc -O0 -mattr=+hwmult16 < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" target triple = "msp430---elf" diff --git a/test/CodeGen/MSP430/hwmult32.ll b/test/CodeGen/MSP430/hwmult32.ll index 6ffeb9698862..10c831e77ffb 100644 --- a/test/CodeGen/MSP430/hwmult32.ll +++ b/test/CodeGen/MSP430/hwmult32.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mhwmult=32bit < %s | FileCheck %s +; RUN: llc -O0 -mattr=+hwmult32 < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" target triple = "msp430---elf" diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll index 51ca4be4a654..c57922ece7d0 100644 --- a/test/CodeGen/MSP430/hwmultf5.ll +++ b/test/CodeGen/MSP430/hwmultf5.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s +; RUN: llc -O0 -mattr=+hwmultf5 < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" target triple = "msp430---elf" diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll index a708b89cbd8f..4baf499848fd 100644 --- a/test/CodeGen/MSP430/vararg.ll +++ b/test/CodeGen/MSP430/vararg.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -verify-machineinstrs < %s | FileCheck %s target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16" target triple = "msp430---elf" diff --git a/test/CodeGen/Nios2/lit.local.cfg b/test/CodeGen/Nios2/lit.local.cfg new file mode 100644 index 000000000000..84c8b039391b --- /dev/null +++ b/test/CodeGen/Nios2/lit.local.cfg @@ -0,0 +1,3 @@ +if not 'Nios2' in config.root.targets: + config.unsupported = True + diff --git a/test/CodeGen/Nios2/target_support.ll b/test/CodeGen/Nios2/target_support.ll new file mode 100644 index 000000000000..90e7020b2fcc --- /dev/null +++ b/test/CodeGen/Nios2/target_support.ll @@ -0,0 +1,11 @@ +; This tests that llc accepts Nios2 target. 
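+; The checks below only verify that the backend is recognized (no 'invalid
+; target' or 'unable to get target' error); the emitted code is not inspected.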
+ +; RUN: not not llc < %s -asm-verbose=false -march=nios2 2>&1 | FileCheck %s --check-prefix=ARCH +; RUN: not not llc < %s -asm-verbose=false -mtriple=nios2 2>&1 | FileCheck %s --check-prefix=TRIPLE + +; ARCH-NOT: invalid target +; TRIPLE-NOT: unable to get target + +define i32 @f(i32 %i) { + ret i32 %i +} diff --git a/test/CodeGen/PowerPC/atomics-constant.ll b/test/CodeGen/PowerPC/atomics-constant.ll new file mode 100644 index 000000000000..a92ca813af85 --- /dev/null +++ b/test/CodeGen/PowerPC/atomics-constant.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s | FileCheck %s + +target triple = "powerpc64le-unknown-linux-gnu" + +@a = constant i64 zeroinitializer + +define i64 @foo() { +; CHECK-LABEL: foo: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis 3, 2, .LC0@toc@ha +; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: ld 3, .LC0@toc@l(3) +; CHECK-NEXT: cmpw 7, 4, 4 +; CHECK-NEXT: ld 3, 0(3) +; CHECK-NEXT: bne- 7, .+4 +; CHECK-NEXT: isync +; CHECK-NEXT: li 3, 0 +; CHECK-NEXT: blr +entry: + %value = load atomic i64, i64* @a acquire, align 8 + ret i64 %value +} diff --git a/test/CodeGen/PowerPC/build-vector-tests.ll b/test/CodeGen/PowerPC/build-vector-tests.ll index 1bce9d4cb439..c42f677d17ab 100644 --- a/test/CodeGen/PowerPC/build-vector-tests.ll +++ b/test/CodeGen/PowerPC/build-vector-tests.ll @@ -869,9 +869,9 @@ entry: ; P9LE-LABEL: fromDiffConstsi ; P8BE-LABEL: fromDiffConstsi ; P8LE-LABEL: fromDiffConstsi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -899,9 +899,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAi ; P8BE-LABEL: fromDiffMemConsAi ; P8LE-LABEL: fromDiffMemConsAi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -929,12 +929,12 @@ entry: ; P9LE-LABEL: fromDiffMemConsDi ; P8BE-LABEL: fromDiffMemConsDi ; P8LE-LABEL: fromDiffMemConsDi -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: lxvw4x @@ -1018,13 +1018,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDi ; P8LE-LABEL: fromDiffMemVarDi ; P9BE: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxvx {{v[0-9]+}}, r3, -; P9BE-DAG: lxvx +; P9BE-DAG: lxv {{v[0-9]+}} +; P9BE-DAG: lxv ; P9BE: vperm ; P9BE: blr ; P9LE: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxvx {{v[0-9]+}}, r3, -; P9LE-DAG: lxvx +; P9LE-DAG: lxv {{v[0-9]+}} +; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: sldi {{r[0-9]+}}, r4, 2 @@ -1281,9 +1281,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoi ; P8BE-LABEL: fromDiffConstsConvftoi ; P8LE-LABEL: fromDiffConstsConvftoi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -1303,10 +1303,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvftoi ; P8BE-LABEL: fromDiffMemConsAConvftoi ; P8LE-LABEL: fromDiffMemConsAConvftoi -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE: xvcvspsxws v2, [[REG1]] ; P9BE: blr -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE: xvcvspsxws v2, [[REG1]] ; P9LE: blr ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3 @@ -1341,13 +1341,13 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvftoi ; P8BE-LABEL: fromDiffMemConsDConvftoi ; P8LE-LABEL: fromDiffMemConsDConvftoi -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: xvcvspsxws ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; 
P9LE: xvcvspsxws ; P9LE: blr @@ -1557,9 +1557,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoi ; P8BE-LABEL: fromDiffConstsConvdtoi ; P8LE-LABEL: fromDiffConstsConvdtoi -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -1584,16 +1584,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoi ; P8BE-LABEL: fromDiffMemConsAConvdtoi ; P8LE-LABEL: fromDiffMemConsAConvdtoi -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspsxws v2, v2 -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -2027,9 +2027,9 @@ entry: ; P9LE-LABEL: fromDiffConstsui ; P8BE-LABEL: fromDiffConstsui ; P8LE-LABEL: fromDiffConstsui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2057,9 +2057,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAui ; P8BE-LABEL: fromDiffMemConsAui ; P8LE-LABEL: fromDiffMemConsAui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2087,12 +2087,12 @@ entry: ; P9LE-LABEL: fromDiffMemConsDui ; P8BE-LABEL: fromDiffMemConsDui ; P8LE-LABEL: fromDiffMemConsDui -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; P9LE: blr ; P8BE: lxvw4x @@ -2177,13 +2177,13 @@ entry: ; P8BE-LABEL: fromDiffMemVarDui ; P8LE-LABEL: fromDiffMemVarDui ; P9BE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9BE-DAG: lxvx {{v[0-9]+}}, r3 -; P9BE-DAG: lxvx +; P9BE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9BE-DAG: lxv ; P9BE: vperm ; P9BE: blr ; P9LE-DAG: sldi {{r[0-9]+}}, r4, 2 -; P9LE-DAG: lxvx {{v[0-9]+}}, r3 -; P9LE-DAG: lxvx +; P9LE-DAG: lxv {{v[0-9]+}}, -12(r3) +; P9LE-DAG: lxv ; P9LE: vperm ; P9LE: blr ; P8BE-DAG: sldi {{r[0-9]+}}, r4, 2 @@ -2439,9 +2439,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoui ; P8BE-LABEL: fromDiffConstsConvftoui ; P8LE-LABEL: fromDiffConstsConvftoui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2461,10 +2461,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvftoui ; P8BE-LABEL: fromDiffMemConsAConvftoui ; P8LE-LABEL: fromDiffMemConsAConvftoui -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9BE: xvcvspuxws v2, [[REG1]] ; P9BE: blr -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) ; P9LE: xvcvspuxws v2, [[REG1]] ; P9LE: blr ; P8BE: lxvw4x [[REG1:[vs0-9]+]], 0, r3 @@ -2499,13 +2499,13 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvftoui ; P8BE-LABEL: fromDiffMemConsDConvftoui ; P8LE-LABEL: fromDiffMemConsDConvftoui -; P9BE: lxvx -; P9BE: lxvx +; P9BE: lxv +; P9BE: lxv ; P9BE: vperm ; P9BE: xvcvspuxws ; P9BE: blr -; P9LE: lxvx -; P9LE: lxvx +; P9LE: lxv +; P9LE: lxv ; P9LE: vperm ; P9LE: xvcvspuxws ; P9LE: blr @@ -2715,9 +2715,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoui ; P8BE-LABEL: fromDiffConstsConvdtoui ; 
P8LE-LABEL: fromDiffConstsConvdtoui -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvw4x ; P8BE: blr @@ -2742,16 +2742,16 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoui ; P8BE-LABEL: fromDiffMemConsAConvdtoui ; P8LE-LABEL: fromDiffMemConsAConvdtoui -; P9BE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9BE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9BE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9BE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9BE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG1]], [[REG2]] ; P9BE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] ; P9BE-DAG: xvcvdpsp [[REG6:[vs0-9]+]], [[REG4]] ; P9BE: vmrgew v2, [[REG6]], [[REG5]] ; P9BE: xvcvspuxws v2, v2 -; P9LE: lxvx [[REG1:[vs0-9]+]], 0, r3 -; P9LE: lxvx [[REG2:[vs0-9]+]], r3, r4 +; P9LE: lxv [[REG1:[vs0-9]+]], 0(r3) +; P9LE: lxv [[REG2:[vs0-9]+]], 16(r3) ; P9LE-DAG: xxmrgld [[REG3:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xxmrghd [[REG4:[vs0-9]+]], [[REG2]], [[REG1]] ; P9LE-DAG: xvcvdpsp [[REG5:[vs0-9]+]], [[REG3]] @@ -3087,9 +3087,9 @@ entry: ; P9LE-LABEL: spltConst1ll ; P8BE-LABEL: spltConst1ll ; P8LE-LABEL: spltConst1ll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3105,9 +3105,9 @@ entry: ; P9LE-LABEL: spltConst16kll ; P8BE-LABEL: spltConst16kll ; P8LE-LABEL: spltConst16kll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3123,9 +3123,9 @@ entry: ; P9LE-LABEL: spltConst32kll ; P8BE-LABEL: spltConst32kll ; P8LE-LABEL: spltConst32kll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3165,9 +3165,9 @@ entry: ; P9LE-LABEL: fromDiffConstsll ; P8BE-LABEL: fromDiffConstsll ; P8LE-LABEL: fromDiffConstsll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3188,9 +3188,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAll ; P8BE-LABEL: fromDiffMemConsAll ; P8LE-LABEL: fromDiffMemConsAll -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -3213,9 +3213,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsDll ; P8BE-LABEL: fromDiffMemConsDll ; P8LE-LABEL: fromDiffMemConsDll -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE: blr ; P8BE: lxvd2x @@ -3275,11 +3275,11 @@ entry: ; P8BE-LABEL: fromDiffMemVarDll ; P8LE-LABEL: fromDiffMemVarDll ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE: xxswapd v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE-NEXT: blr ; P8BE: sldi @@ -3422,9 +3422,9 @@ entry: ; P9LE-LABEL: spltCnstConvftoll ; P8BE-LABEL: spltCnstConvftoll ; P8LE-LABEL: spltCnstConvftoll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3466,9 +3466,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoll ; P8BE-LABEL: fromDiffConstsConvftoll ; P8LE-LABEL: fromDiffConstsConvftoll -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -3705,9 +3705,9 @@ entry: ; P9LE-LABEL: spltCnstConvdtoll ; P8BE-LABEL: spltCnstConvdtoll ; P8LE-LABEL: spltCnstConvdtoll -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3749,9 +3749,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoll ; P8BE-LABEL: fromDiffConstsConvdtoll ; P8LE-LABEL: fromDiffConstsConvdtoll -; 
P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -3770,10 +3770,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoll ; P8BE-LABEL: fromDiffMemConsAConvdtoll ; P8LE-LABEL: fromDiffMemConsAConvdtoll -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x @@ -3801,11 +3801,11 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvdtoll ; P8BE-LABEL: fromDiffMemConsDConvdtoll ; P8LE-LABEL: fromDiffMemConsDConvdtoll -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr @@ -3876,12 +3876,12 @@ entry: ; P8BE-LABEL: fromDiffMemVarDConvdtoll ; P8LE-LABEL: fromDiffMemVarDConvdtoll ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpsxds v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpsxds v2 ; P9LE-NEXT: blr @@ -3991,9 +3991,9 @@ entry: ; P9LE-LABEL: spltConst1ull ; P8BE-LABEL: spltConst1ull ; P8LE-LABEL: spltConst1ull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4009,9 +4009,9 @@ entry: ; P9LE-LABEL: spltConst16kull ; P8BE-LABEL: spltConst16kull ; P8LE-LABEL: spltConst16kull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4027,9 +4027,9 @@ entry: ; P9LE-LABEL: spltConst32kull ; P8BE-LABEL: spltConst32kull ; P8LE-LABEL: spltConst32kull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4069,9 +4069,9 @@ entry: ; P9LE-LABEL: fromDiffConstsull ; P8BE-LABEL: fromDiffConstsull ; P8LE-LABEL: fromDiffConstsull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4092,9 +4092,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsAull ; P8BE-LABEL: fromDiffMemConsAull ; P8LE-LABEL: fromDiffMemConsAull -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4117,9 +4117,9 @@ entry: ; P9LE-LABEL: fromDiffMemConsDull ; P8BE-LABEL: fromDiffMemConsDull ; P8LE-LABEL: fromDiffMemConsDull -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE: blr ; P8BE: lxvd2x @@ -4179,11 +4179,11 @@ entry: ; P8BE-LABEL: fromDiffMemVarDull ; P8LE-LABEL: fromDiffMemVarDull ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE: xxswapd v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE: xxswapd v2 ; P9LE-NEXT: blr ; P8BE: sldi @@ -4326,9 +4326,9 @@ entry: ; P9LE-LABEL: spltCnstConvftoull ; P8BE-LABEL: spltCnstConvftoull ; P8LE-LABEL: spltCnstConvftoull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4370,9 +4370,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvftoull ; P8BE-LABEL: fromDiffConstsConvftoull ; P8LE-LABEL: fromDiffConstsConvftoull -; P9BE: lxvx v2 +; P9BE: lxv v2 ; P9BE: blr -; P9LE: lxvx v2 +; P9LE: lxv v2 ; P9LE: blr ; P8BE: lxvd2x v2 ; P8BE: blr @@ -4609,9 +4609,9 @@ entry: ; P9LE-LABEL: spltCnstConvdtoull ; P8BE-LABEL: spltCnstConvdtoull ; P8LE-LABEL: spltCnstConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4653,9 +4653,9 @@ entry: ; P9LE-LABEL: fromDiffConstsConvdtoull ; P8BE-LABEL: 
fromDiffConstsConvdtoull ; P8LE-LABEL: fromDiffConstsConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE: blr ; P8BE: lxvd2x ; P8BE: blr @@ -4674,10 +4674,10 @@ entry: ; P9LE-LABEL: fromDiffMemConsAConvdtoull ; P8BE-LABEL: fromDiffMemConsAConvdtoull ; P8LE-LABEL: fromDiffMemConsAConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr ; P8BE: lxvd2x @@ -4705,11 +4705,11 @@ entry: ; P9LE-LABEL: fromDiffMemConsDConvdtoull ; P8BE-LABEL: fromDiffMemConsDConvdtoull ; P8LE-LABEL: fromDiffMemConsDConvdtoull -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr @@ -4780,12 +4780,12 @@ entry: ; P8BE-LABEL: fromDiffMemVarDConvdtoull ; P8LE-LABEL: fromDiffMemVarDConvdtoull ; P9BE: sldi -; P9BE: lxvx +; P9BE: lxv ; P9BE-NEXT: xxswapd ; P9BE-NEXT: xvcvdpuxds v2 ; P9BE-NEXT: blr ; P9LE: sldi -; P9LE: lxvx +; P9LE: lxv ; P9LE-NEXT: xxswapd ; P9LE-NEXT: xvcvdpuxds v2 ; P9LE-NEXT: blr diff --git a/test/CodeGen/PowerPC/livephysregs.mir b/test/CodeGen/PowerPC/livephysregs.mir new file mode 100644 index 000000000000..6b6268778e99 --- /dev/null +++ b/test/CodeGen/PowerPC/livephysregs.mir @@ -0,0 +1,52 @@ +# RUN: llc -o - %s -mtriple=powerpc64le--linux-gnu -run-pass=branch-folder | FileCheck %s +# The branch-folder should merge bb.1 and bb.5 below and therefore recalculate +# the liveins list of the merged block. This test is checking whether this +# recalculated list if okay and contains all the non-saved and saved CSRs. +# CHECK-LABEL: name: func +# CHECK: bb.3: +# CHECK-NEXT: liveins: %x30, %x29, %x3, %x6 +# CHECK: %x4 = RLDICR killed %x6, 16, 47 +# CHECK: %x3 = OR8 killed %x4, killed %x3 +# CHECK: BLR8 implicit %lr8, implicit %rm, implicit %x3 +--- +name: func +tracksRegLiveness: true +fixedStack: + - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' } + - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' } + - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false } +body: | + bb.0: + liveins: %x3, %x5, %x29, %x30 + + %x6 = RLWINM8 %x3, 16, 16, 31 + %x3 = RLDICL killed %x3, 0, 48 + BC undef %cr5lt, %bb.3 + + bb.1: + liveins: %x3, %x6, %x29, %x30 + + %x4 = RLDICR killed %x6, 16, 47 + %x3 = OR8 killed %x4, killed %x3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 + + bb.3: + liveins: %x3, %x5, %x6, %x29, %x30 + + dead %x5 = ADD8 %x5, %x6 + BC undef %cr5lt, %bb.1 + + bb.6: + liveins: %x3, %x6, %x29, %x30 + STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1) + STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16) + NOP implicit-def dead %x29 + NOP implicit-def dead %x30 + + %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16) + %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1) + + %x4 = RLDICR killed %x6, 16, 47 + %x3 = OR8 killed %x4, killed %x3 + BLR8 implicit %lr8, implicit %rm, implicit %x3 +... 
diff --git a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll index 329f5bb59cb1..de930af75b2d 100644 --- a/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll +++ b/test/CodeGen/PowerPC/p8altivec-shuffles-pred.ll @@ -21,7 +21,7 @@ entry: ret <16 x i8> %strided.vec ; CHECK-LABEL: @test2 -; CHECK: vsldoi 2, 2, 2, 12 +; CHECK: xxsldwi 34, 34, 34, 3 ; CHECK: blr } diff --git a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll index e3326595d132..fe34bcb85637 100644 --- a/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll +++ b/test/CodeGen/PowerPC/p9-xxinsertw-xxextractuw.ll @@ -6,7 +6,7 @@ define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -45,7 +45,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -54,7 +54,7 @@ entry: define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -93,7 +93,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -102,7 +102,7 @@ entry: define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -141,7 +141,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -150,7 +150,7 @@ entry: define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -189,7 +189,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> ret <4 x float> %vecins @@ -198,7 +198,7 @@ entry: define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 
12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -237,7 +237,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -246,7 +246,7 @@ entry: define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -285,7 +285,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -294,7 +294,7 @@ entry: define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -333,7 +333,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -342,7 +342,7 @@ entry: define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -381,7 +381,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %vecins @@ -546,7 +546,7 @@ entry: define <4 x float> @_Z7testInsILj0ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -585,7 +585,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -594,7 +594,7 @@ entry: define <4 x float> @_Z7testInsILj1ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -633,7 +633,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: 
xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -642,7 +642,7 @@ entry: define <4 x float> @_Z7testInsILj2ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -681,7 +681,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -690,7 +690,7 @@ entry: define <4 x float> @_Z7testInsILj3ELj0EDv4_fET1_S1_S1_r(<4 x float> %a, <4 x float> %b) { entry: ; CHECK-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_fET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -729,7 +729,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_fET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32> ret <4 x float> %vecins @@ -738,7 +738,7 @@ entry: define <4 x i32> @_Z7testInsILj0ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -777,7 +777,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 12 ; CHECK-BE-LABEL: _Z7testInsILj0ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 0 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -786,7 +786,7 @@ entry: define <4 x i32> @_Z7testInsILj1ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -825,7 +825,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 8 ; CHECK-BE-LABEL: _Z7testInsILj1ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 4 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -834,7 +834,7 @@ entry: define <4 x i32> @_Z7testInsILj2ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -873,7 +873,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 4 ; CHECK-BE-LABEL: _Z7testInsILj2ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 8 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -882,7 +882,7 @@ entry: define <4 x i32> @_Z7testInsILj3ELj0EDv4_jET1_S1_S1_r(<4 x i32> %a, <4 x i32> %b) { entry: ; CHECK-LABEL: 
_Z7testInsILj3ELj0EDv4_jET1_S1_S1_ -; CHECK: xxsldwi 0, 35, 35, 2 +; CHECK: xxswapd 0, 35 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj0EDv4_jET1_S1_S1_ ; CHECK-BE: xxsldwi 0, 35, 35, 3 @@ -921,7 +921,7 @@ entry: ; CHECK: xxsldwi 0, 35, 35, 3 ; CHECK: xxinsertw 34, 0, 0 ; CHECK-BE-LABEL: _Z7testInsILj3ELj3EDv4_jET1_S1_S1_ -; CHECK-BE: xxsldwi 0, 35, 35, 2 +; CHECK-BE: xxswapd 0, 35 ; CHECK-BE: xxinsertw 34, 0, 12 %vecins = shufflevector <4 x i32> %b, <4 x i32> %a, <4 x i32> ret <4 x i32> %vecins @@ -972,10 +972,10 @@ define <4 x float> @insertVarF(<4 x float> %a, float %f, i32 %el) { entry: ; CHECK-LABEL: insertVarF ; CHECK: stxsspx 1, -; CHECK: lxvx +; CHECK: lxv ; CHECK-BE-LABEL: insertVarF ; CHECK-BE: stxsspx 1, -; CHECK-BE: lxvx +; CHECK-BE: lxv %vecins = insertelement <4 x float> %a, float %f, i32 %el ret <4 x float> %vecins } @@ -983,10 +983,10 @@ define <4 x i32> @insertVarI(<4 x i32> %a, i32 %i, i32 %el) { entry: ; CHECK-LABEL: insertVarI ; CHECK: stwx -; CHECK: lxvx +; CHECK: lxv ; CHECK-BE-LABEL: insertVarI ; CHECK-BE: stwx -; CHECK-BE: lxvx +; CHECK-BE: lxv %vecins = insertelement <4 x i32> %a, i32 %i, i32 %el ret <4 x i32> %vecins } diff --git a/test/CodeGen/PowerPC/ppc64-i128-abi.ll b/test/CodeGen/PowerPC/ppc64-i128-abi.ll index 4a8fd90db3eb..90dd1d84fc23 100644 --- a/test/CodeGen/PowerPC/ppc64-i128-abi.ll +++ b/test/CodeGen/PowerPC/ppc64-i128-abi.ll @@ -63,7 +63,7 @@ define <1 x i128> @v1i128_increment_by_one(<1 x i128> %a) nounwind { ; FIXME: li [[R1:r[0-9]+]], 1 ; FIXME: li [[R2:r[0-9]+]], 0 ; FIXME: mtvsrdd [[V1:v[0-9]+]], [[R2]], [[R1]] -; CHECK-P9: lxvx [[V1:v[0-9]+]] +; CHECK-P9: lxv [[V1:v[0-9]+]] ; CHECK-P9: vadduqm v2, v2, [[V1]] ; CHECK-P9: blr @@ -207,7 +207,7 @@ define <1 x i128> @call_v1i128_increment_by_one() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_one -; CHECK-P9: lxvx +; CHECK-P9: lxv ; CHECK-P9: bl v1i128_increment_by_one ; CHECK-P9: blr @@ -237,8 +237,8 @@ define <1 x i128> @call_v1i128_increment_by_val() nounwind { ; CHECK-LE: blr ; CHECK-P9-LABEL: @call_v1i128_increment_by_val -; CHECK-P9-DAG: lxvx v2 -; CHECK-P9-DAG: lxvx v3 +; CHECK-P9-DAG: lxv v2 +; CHECK-P9-DAG: lxv v3 ; CHECK-P9: bl v1i128_increment_by_val ; CHECK-P9: blr diff --git a/test/CodeGen/PowerPC/pr25157-peephole.ll b/test/CodeGen/PowerPC/pr25157-peephole.ll index 7f959add00f6..aacd64e401f5 100644 --- a/test/CodeGen/PowerPC/pr25157-peephole.ll +++ b/test/CodeGen/PowerPC/pr25157-peephole.ll @@ -65,5 +65,5 @@ L.LB38_2452: ; CHECK-P9-LABEL: @aercalc_ ; CHECK-P9: lfs ; CHECK-P9: xxspltd -; CHECK-P9: stxvx +; CHECK-P9: stxv ; CHECK-P9-NOT: xxswapd diff --git a/test/CodeGen/PowerPC/pr27078.ll b/test/CodeGen/PowerPC/pr27078.ll index b100e3a5ba53..d97008ee5578 100644 --- a/test/CodeGen/PowerPC/pr27078.ll +++ b/test/CodeGen/PowerPC/pr27078.ll @@ -9,11 +9,11 @@ define <4 x float> @bar(float* %p, float* %q) { %6 = shufflevector <12 x float> %5, <12 x float> undef, <4 x i32> ret <4 x float> %6 -; CHECK: vsldoi +; CHECK: xxsldwi ; CHECK-NEXT: vmrghw ; CHECK-NEXT: vmrglw -; CHECK-NEXT: vsldoi -; CHECK-NEXT: vsldoi -; CHECK-NEXT: vsldoi +; CHECK-NEXT: xxsldwi +; CHECK-NEXT: xxsldwi +; CHECK-NEXT: xxsldwi ; CHECK-NEXT: blr } diff --git a/test/CodeGen/PowerPC/swaps-le-6.ll b/test/CodeGen/PowerPC/swaps-le-6.ll index d573441f2cc9..e7640cab6aef 100644 --- a/test/CodeGen/PowerPC/swaps-le-6.ll +++ b/test/CodeGen/PowerPC/swaps-le-6.ll @@ -33,11 +33,11 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar0 -; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] +; CHECK-P9-DAG: 
lxv [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxpermdi [[REG5:[0-9]+]], [[REG1]], [[REG4]], 1 -; CHECK-P9: stxvx [[REG5]] +; CHECK-P9: stxv [[REG5]] define void @bar1() { entry: @@ -56,9 +56,9 @@ entry: ; CHECK: stxvd2x [[REG5]] ; CHECK-P9-LABEL: @bar1 -; CHECK-P9-DAG: lxvx [[REG1:[0-9]+]] +; CHECK-P9-DAG: lxv [[REG1:[0-9]+]] ; CHECK-P9-DAG: lfd [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG4:[0-9]+]], [[REG2]], 0 ; CHECK-P9: xxmrgld [[REG5:[0-9]+]], [[REG4]], [[REG1]] -; CHECK-P9: stxvx [[REG5]] +; CHECK-P9: stxv [[REG5]] diff --git a/test/CodeGen/PowerPC/vec_sldwi.ll b/test/CodeGen/PowerPC/vec_sldwi.ll new file mode 100644 index 000000000000..01537d1f5927 --- /dev/null +++ b/test/CodeGen/PowerPC/vec_sldwi.ll @@ -0,0 +1,307 @@ +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-LE +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 < %s | \ +; RUN: FileCheck %s -check-prefix=CHECK-BE + +; Possible LE ShuffleVector masks (Case 1): +; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)b, 7, 0, 1, 2) +; ShuffleVector((vector int)a, vector(int)b, 6, 7, 0, 1) +; ShuffleVector((vector int)a, vector(int)b, 5, 6, 7, 0) +; which targets at: +; xxsldwi a, b, 0 +; xxsldwi a, b, 1 +; xxsldwi a, b, 2 +; xxsldwi a, b, 3 +; Possible LE Swap ShuffleVector masks (Case 2): +; ShuffleVector((vector int)a, vector(int)b, 4, 5, 6, 7) +; ShuffleVector((vector int)a, vector(int)b, 3, 4, 5, 6) +; ShuffleVector((vector int)a, vector(int)b, 2, 3, 4, 5) +; ShuffleVector((vector int)a, vector(int)b, 1, 2, 3, 4) +; which targets at: +; xxsldwi b, a, 0 +; xxsldwi b, a, 1 +; xxsldwi b, a, 2 +; xxsldwi b, a, 3 +; Possible LE ShuffleVector masks when a == b, b is undef (Case 3): +; ShuffleVector((vector int)a, vector(int)a, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)a, 3, 0, 1, 2) +; ShuffleVector((vector int)a, vector(int)a, 2, 3, 0, 1) +; ShuffleVector((vector int)a, vector(int)a, 1, 2, 3, 0) +; which targets at: +; xxsldwi a, a, 0 +; xxsldwi a, a, 1 +; xxsldwi a, a, 2 +; xxsldwi a, a, 3 + +; Possible BE ShuffleVector masks (Case 4): +; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)b, 1, 2, 3, 4) +; ShuffleVector((vector int)a, vector(int)b, 2, 3, 4, 5) +; ShuffleVector((vector int)a, vector(int)b, 3, 4, 5, 6) +; which targets at: +; xxsldwi b, a, 0 +; xxsldwi b, a, 1 +; xxsldwi a, a, 2 +; xxsldwi a, a, 3 +; Possible BE Swap ShuffleVector masks (Case 5): +; ShuffleVector((vector int)a, vector(int)b, 4, 5, 6, 7) +; ShuffleVector((vector int)a, vector(int)b, 5, 6, 7, 0) +; ShuffleVector((vector int)a, vector(int)b, 6, 7, 0, 1) +; ShuffleVector((vector int)a, vector(int)b, 7, 0, 1, 2) +; which targets at: +; xxsldwi b, a, 0 +; xxsldwi b, a, 1 +; xxsldwi b, a, 2 +; xxsldwi b, a, 3 +; Possible BE ShuffleVector masks when a == b, b is undef (Case 6): +; ShuffleVector((vector int)a, vector(int)b, 0, 1, 2, 3) +; ShuffleVector((vector int)a, vector(int)a, 1, 2, 3, 0) +; ShuffleVector((vector int)a, vector(int)a, 2, 3, 0, 1) +; ShuffleVector((vector int)a, vector(int)a, 3, 0, 1, 2) +; which targets at: +; xxsldwi a, a, 0 +; xxsldwi a, a, 1 +; xxsldwi a, a, 2 +; xxsldwi a, a, 3 + +define <4 x i32> @check_le_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x 
i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_0 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_1 +; CHECK-LE: xxsldwi 34, 34, 35, 1 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_2 +; CHECK-LE: xxsldwi 34, 34, 35, 2 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_vb_3 +; CHECK-LE: xxsldwi 34, 34, 35, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_0 +; CHECK-LE; vmr 2, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_1 +; CHECK-LE: xxsldwi 34, 35, 34, 1 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_2 +; CHECK-LE: xxsldwi 34, 35, 34, 2 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_swap_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_swap_vec_sldwi_va_vb_3 +; CHECK-LE: xxsldwi 34, 35, 34, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_0(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_0 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_1(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_le_vec_sldwi_va_undef_1 +; CHECK-LE: xxsldwi 34, 34, 34, 1 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_2(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_2 +; CHECK-LE: xxswapd 34, 34 +; CHECK-LE: blr +} + +define <4 x i32> @check_le_vec_sldwi_va_undef_3(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_le_vec_sldwi_va_undef_3 +; CHECK-LE: xxsldwi 34, 34, 34, 3 +; CHECK-LE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_0 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_1 +; CHECK-BE: xxsldwi 34, 34, 35, 1 +; CHECK-BE: blr +} + +define <4 x i32> 
@check_be_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_2 +; CHECK-BE: xxsldwi 34, 34, 35, 2 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_vb_3 +; CHECK-BE: xxsldwi 34, 34, 35, 3 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_0(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_0 +; CHECK-LE; vmr 2, 3 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_1(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_1 +; CHECK-BE: xxsldwi 34, 35, 34, 1 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_2(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_2 +; CHECK-BE: xxsldwi 34, 35, 34, 2 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_swap_vec_sldwi_va_vb_3(<4 x i32> %VA, <4 x i32> %VB) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> %VB, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_swap_vec_sldwi_va_vb_3 +; CHECK-BE: xxsldwi 34, 35, 34, 3 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_0(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-LE-LABEL: @check_be_vec_sldwi_va_undef_0 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_1(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_1 +; CHECK-BE: xxsldwi 34, 34, 34, 1 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_2(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_2 +; CHECK-BE: xxswapd 34, 34 +; CHECK-BE: blr +} + +define <4 x i32> @check_be_vec_sldwi_va_undef_3(<4 x i32> %VA) { +entry: + %0 = shufflevector <4 x i32> %VA, <4 x i32> undef, <4 x i32> + ret <4 x i32> %0 +; CHECK-BE-LABEL: @check_be_vec_sldwi_va_undef_3 +; CHECK-BE: xxsldwi 34, 34, 34, 3 +; CHECK-BE: blr +} + +; More test cases to test different types of vector inputs +define <16 x i8> @test_le_vec_sldwi_v16i8_v16i8(<16 x i8> %VA, <16 x i8> %VB) { + entry: + %0 = shufflevector <16 x i8> %VA, <16 x i8> %VB,<16 x i32> + ret <16 x i8> %0 +; CHECK-LE-LABEL: @test_le_vec_sldwi_v16i8_v16i8 +; CHECK-LE: xxsldwi 34, 34, 35, 1 +; CHECK-LE: blr +} + +define <8 x i16> @test_le_vec_sldwi_v8i16_v8i16(<8 x i16> %VA, <8 x i16> %VB) { + entry: + %0 = shufflevector <8 x i16> %VA, <8 x i16> %VB,<8 x i32> + ret <8 x i16> %0 +; CHECK-LE-LABEL: @test_le_vec_sldwi_v8i16_v8i16 +; CHECK-LE: xxsldwi 34, 34, 35, 1 +; CHECK-LE: blr +} + +; Note here xxpermdi 34, 34, 35, 2 <=> xxsldwi 34, 34, 35, 2 +define <2 x i64> @test_be_vec_sldwi_v2i64_v2i64(<2 x i64> %VA, <2 x i64> %VB) { + entry: + %0 = shufflevector <2 x i64> %VA, <2 x i64> %VB,<2 x i32> + ret <2 x i64> %0 +; CHECK-LE-LABEL: @test_be_vec_sldwi_v2i64_v2i64 +; CHECK-LE: xxpermdi 
34, 34, 35, 2 +; CHECK-LE: blr +} diff --git a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll index acedc2606331..0f0426526cc1 100644 --- a/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll +++ b/test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll @@ -26,82 +26,82 @@ entry: ; CHECK-LABEL: test1 ; CHECK-P9-LABEL: test1 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %0 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vsi to i8*)) ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <4 x i32> %0, <4 x i32>* @res_vsi, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %1 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x i32>* @vui to i8*)) ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <4 x i32> %1, <4 x i32>* @res_vui, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %2 = call <4 x i32> @llvm.ppc.vsx.lxvw4x(i8* bitcast (<4 x float>* @vf to i8*)) %3 = bitcast <4 x i32> %2 to <4 x float> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <4 x float> %3, <4 x float>* @res_vf, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %4 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vsll to i8*)) %5 = bitcast <2 x double> %4 to <2 x i64> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <2 x i64> %5, <2 x i64>* @res_vsll, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %6 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x i64>* @vull to i8*)) %7 = bitcast <2 x double> %6 to <2 x i64> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <2 x i64> %7, <2 x i64>* @res_vull, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %8 = call <2 x double> @llvm.ppc.vsx.lxvd2x(i8* bitcast (<2 x double>* @vd to i8*)) ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv store <2 x double> %8, <2 x double>* @res_vd, align 16 ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %9 = load <4 x i32>, <4 x i32>* @vsi, align 16 ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %9, i8* bitcast (<4 x i32>* @res_vsi to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %10 = load <4 x i32>, <4 x i32>* @vui, align 16 ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %10, i8* bitcast (<4 x i32>* @res_vui to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %11 = load <4 x float>, <4 x float>* @vf, align 16 %12 = bitcast <4 x float> %11 to <4 x i32> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvw4x(<4 x i32> %12, i8* bitcast (<4 x float>* @res_vf to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %13 = load <2 x i64>, <2 x i64>* @vsll, align 16 %14 = bitcast <2 x i64> %13 to <2 x double> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvd2x(<2 x double> %14, i8* bitcast (<2 x i64>* @res_vsll to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %15 = load <2 x i64>, <2 x i64>* @vull, align 16 %16 = bitcast <2 x i64> %15 to <2 x double> ; CHECK: stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvd2x(<2 x double> %16, i8* bitcast (<2 x i64>* @res_vull to i8*)) ; CHECK: lxvd2x -; CHECK-P9-DAG: lxvx +; CHECK-P9-DAG: lxv %17 = load <2 x double>, <2 x double>* @vd, align 16 ; CHECK: 
stxvd2x -; CHECK-P9-DAG: stxvx +; CHECK-P9-DAG: stxv call void @llvm.ppc.vsx.stxvd2x(<2 x double> %17, i8* bitcast (<2 x double>* @res_vd to i8*)) ret void } diff --git a/test/CodeGen/PowerPC/vsx-ldst.ll b/test/CodeGen/PowerPC/vsx-ldst.ll index d8dd635aab5f..0bbc633363a7 100644 --- a/test/CodeGen/PowerPC/vsx-ldst.ll +++ b/test/CodeGen/PowerPC/vsx-ldst.ll @@ -21,8 +21,8 @@ ; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O2 \ ; RUN: -mtriple=powerpc64le-unknown-linux-gnu < %s > %t -; RUN: grep lxvx < %t | count 6 -; RUN: grep stxvx < %t | count 6 +; RUN: grep lxv < %t | count 6 +; RUN: grep stxv < %t | count 6 @vsi = global <4 x i32> , align 16 diff --git a/test/CodeGen/PowerPC/vsx-p9.ll b/test/CodeGen/PowerPC/vsx-p9.ll index ba359501ccc5..0c29b6adad77 100644 --- a/test/CodeGen/PowerPC/vsx-p9.ll +++ b/test/CodeGen/PowerPC/vsx-p9.ll @@ -36,109 +36,109 @@ entry: %1 = load <16 x i8>, <16 x i8>* @ucb, align 16 %add.i = add <16 x i8> %1, %0 tail call void (...) @sink(<16 x i8> %add.i) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddubm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %2 = load <16 x i8>, <16 x i8>* @sca, align 16 %3 = load <16 x i8>, <16 x i8>* @scb, align 16 %add.i22 = add <16 x i8> %3, %2 tail call void (...) @sink(<16 x i8> %add.i22) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddubm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %4 = load <8 x i16>, <8 x i16>* @usa, align 16 %5 = load <8 x i16>, <8 x i16>* @usb, align 16 %add.i21 = add <8 x i16> %5, %4 tail call void (...) @sink(<8 x i16> %add.i21) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduhm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %6 = load <8 x i16>, <8 x i16>* @ssa, align 16 %7 = load <8 x i16>, <8 x i16>* @ssb, align 16 %add.i20 = add <8 x i16> %7, %6 tail call void (...) @sink(<8 x i16> %add.i20) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduhm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %8 = load <4 x i32>, <4 x i32>* @uia, align 16 %9 = load <4 x i32>, <4 x i32>* @uib, align 16 %add.i19 = add <4 x i32> %9, %8 tail call void (...) @sink(<4 x i32> %add.i19) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduwm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %10 = load <4 x i32>, <4 x i32>* @sia, align 16 %11 = load <4 x i32>, <4 x i32>* @sib, align 16 %add.i18 = add <4 x i32> %11, %10 tail call void (...) @sink(<4 x i32> %add.i18) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduwm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %12 = load <2 x i64>, <2 x i64>* @ulla, align 16 %13 = load <2 x i64>, <2 x i64>* @ullb, align 16 %add.i17 = add <2 x i64> %13, %12 tail call void (...) @sink(<2 x i64> %add.i17) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddudm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %14 = load <2 x i64>, <2 x i64>* @slla, align 16 %15 = load <2 x i64>, <2 x i64>* @sllb, align 16 %add.i16 = add <2 x i64> %15, %14 tail call void (...) 
@sink(<2 x i64> %add.i16) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vaddudm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %16 = load <1 x i128>, <1 x i128>* @uxa, align 16 %17 = load <1 x i128>, <1 x i128>* @uxb, align 16 %add.i15 = add <1 x i128> %17, %16 tail call void (...) @sink(<1 x i128> %add.i15) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduqm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %18 = load <1 x i128>, <1 x i128>* @sxa, align 16 %19 = load <1 x i128>, <1 x i128>* @sxb, align 16 %add.i14 = add <1 x i128> %19, %18 tail call void (...) @sink(<1 x i128> %add.i14) -; CHECK: lxvx 34, 0, 3 -; CHECK: lxvx 35, 0, 4 +; CHECK: lxv 34, 0(3) +; CHECK: lxv 35, 0(4) ; CHECK: vadduqm 2, 3, 2 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %20 = load <4 x float>, <4 x float>* @vfa, align 16 %21 = load <4 x float>, <4 x float>* @vfb, align 16 %add.i13 = fadd <4 x float> %20, %21 tail call void (...) @sink(<4 x float> %add.i13) -; CHECK: lxvx 0, 0, 3 -; CHECK: lxvx 1, 0, 4 +; CHECK: lxv 0, 0(3) +; CHECK: lxv 1, 0(4) ; CHECK: xvaddsp 34, 0, 1 -; CHECK: stxvx 34, +; CHECK: stxv 34, ; CHECK: bl sink %22 = load <2 x double>, <2 x double>* @vda, align 16 %23 = load <2 x double>, <2 x double>* @vdb, align 16 %add.i12 = fadd <2 x double> %22, %23 tail call void (...) @sink(<2 x double> %add.i12) -; CHECK: lxvx 0, 0, 3 -; CHECK: lxvx 1, 0, 4 +; CHECK: lxv 0, 0(3) +; CHECK: lxv 1, 0(4) ; CHECK: xvadddp 0, 0, 1 -; CHECK: stxvx 0, +; CHECK: stxv 0, ; CHECK: bl sink ret void } diff --git a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll index 09bf6830416f..98fe3a813cb7 100644 --- a/test/CodeGen/PowerPC/vsx_insert_extract_le.ll +++ b/test/CodeGen/PowerPC/vsx_insert_extract_le.ll @@ -23,7 +23,7 @@ define <2 x double> @testi0(<2 x double>* %p1, double* %p2) { ; CHECK-P9-LABEL: testi0 ; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4) -; CHECK-P9: lxvx [[REG2:[0-9]+]], 0, 3 +; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0 ; CHECK-P9: xxpermdi 34, [[REG2]], [[REG3]], 1 } @@ -43,7 +43,7 @@ define <2 x double> @testi1(<2 x double>* %p1, double* %p2) { ; CHECK-P9-LABEL: testi1 ; CHECK-P9: lfd [[REG1:[0-9]+]], 0(4) -; CHECK-P9: lxvx [[REG2:[0-9]+]], 0, 3 +; CHECK-P9: lxv [[REG2:[0-9]+]], 0(3) ; CHECK-P9: xxspltd [[REG3:[0-9]+]], [[REG1]], 0 ; CHECK-P9: xxmrgld 34, [[REG3]], [[REG2]] } diff --git a/test/CodeGen/PowerPC/vsx_shuffle_le.ll b/test/CodeGen/PowerPC/vsx_shuffle_le.ll index 3bf24adfdd91..cfe201999282 100644 --- a/test/CodeGen/PowerPC/vsx_shuffle_le.ll +++ b/test/CodeGen/PowerPC/vsx_shuffle_le.ll @@ -19,7 +19,7 @@ define <2 x double> @test00(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 0 ; CHECK-P9-LABEL: test00 -; CHECK-P9: lxvx 0, 0, 3 +; CHECK-P9: lxv 0, 0(3) ; CHECK-P9: xxspltd 34, 0, 1 } @@ -34,7 +34,7 @@ define <2 x double> @test01(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxswapd 34, 0 ; CHECK-P9-LABEL: test01 -; CHECK-P9: lxvx 34, 0, 3 +; CHECK-P9: lxv 34, 0(3) } define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) { @@ -51,8 +51,8 @@ define <2 x double> @test02(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrgld 34, 1, 0 ; CHECK-P9-LABEL: @test02 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrgld 34, 1, 0 } @@ -70,8 +70,8 @@ define <2 x double> 
@test03(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 1, 0, 1 ; CHECK-P9-LABEL: @test03 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 1, 0, 1 } @@ -85,7 +85,7 @@ define <2 x double> @test10(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: lxvd2x 34, 0, 3 ; CHECK-P9-LABEL: @test10 -; CHECK-P9: lxvx 0, 0, 3 +; CHECK-P9: lxv 0, 0(3) ; CHECK-P9: xxswapd 34, 0 } @@ -100,7 +100,7 @@ define <2 x double> @test11(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 1 ; CHECK-P9-LABEL: @test11 -; CHECK-P9: lxvx 0, 0, 3 +; CHECK-P9: lxv 0, 0(3) ; CHECK-P9: xxspltd 34, 0, 0 } @@ -118,8 +118,8 @@ define <2 x double> @test12(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 1, 0, 2 ; CHECK-P9-LABEL: @test12 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 1, 0, 2 } @@ -137,8 +137,8 @@ define <2 x double> @test13(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrghd 34, 1, 0 ; CHECK-P9-LABEL: @test13 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrghd 34, 1, 0 } @@ -156,8 +156,8 @@ define <2 x double> @test20(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrgld 34, 0, 1 ; CHECK-P9-LABEL: @test20 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrgld 34, 0, 1 } @@ -175,8 +175,8 @@ define <2 x double> @test21(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 0, 1, 1 ; CHECK-P9-LABEL: @test21 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 0, 1, 1 } @@ -191,7 +191,7 @@ define <2 x double> @test22(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 0 ; CHECK-P9-LABEL: @test22 -; CHECK-P9: lxvx 0, 0, 4 +; CHECK-P9: lxv 0, 0(4) ; CHECK-P9: xxspltd 34, 0, 1 } @@ -206,7 +206,7 @@ define <2 x double> @test23(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxswapd 34, 0 ; CHECK-P9-LABEL: @test23 -; CHECK-P9: lxvx 34, 0, 4 +; CHECK-P9: lxv 34, 0(4) } define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) { @@ -223,8 +223,8 @@ define <2 x double> @test30(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxpermdi 34, 0, 1, 2 ; CHECK-P9-LABEL: @test30 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxpermdi 34, 0, 1, 2 } @@ -242,8 +242,8 @@ define <2 x double> @test31(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxmrghd 34, 0, 1 ; CHECK-P9-LABEL: @test31 -; CHECK-P9: lxvx 0, 0, 3 -; CHECK-P9: lxvx 1, 0, 4 +; CHECK-P9: lxv 0, 0(3) +; CHECK-P9: lxv 1, 0(4) ; CHECK-P9: xxmrghd 34, 0, 1 } @@ -257,7 +257,7 @@ define <2 x double> @test32(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: lxvd2x 34, 0, 4 ; CHECK-P9-LABEL: @test32 -; CHECK-P9: lxvx 0, 0, 4 +; CHECK-P9: lxv 0, 0(4) ; CHECK-P9: xxswapd 34, 0 } @@ -272,6 +272,6 @@ define <2 x double> @test33(<2 x double>* %p1, <2 x double>* %p2) { ; CHECK: xxspltd 34, 0, 1 ; CHECK-P9-LABEL: @test33 -; CHECK-P9: lxvx 0, 0, 4 +; CHECK-P9: lxv 0, 0(4) ; CHECK-P9: xxspltd 34, 0, 0 } diff --git a/test/CodeGen/Thumb/machine-cse-physreg.mir b/test/CodeGen/Thumb/machine-cse-physreg.mir new file mode 100644 index 000000000000..5206e89cf779 --- /dev/null +++ b/test/CodeGen/Thumb/machine-cse-physreg.mir @@ -0,0 +1,35 @@ +# RUN: llc -mtriple thumbv5e -run-pass=machine-cse -o - 
%s | FileCheck %s + +# This is a contrived example made to expose a bug in +# MachineCSE, see PR32538. + +# MachineCSE must not remove this def of %cpsr: +# CHECK-LABEL: bb.1: +# CHECK: , %cpsr = tLSLri + +... +--- +name: spam +registers: + - { id: 0, class: tgpr } + - { id: 1, class: tgpr } + - { id: 2, class: tgpr } + - { id: 3, class: tgpr } +liveins: + - { reg: '%r0', virtual-reg: '%0' } +body: | + bb.0: + liveins: %r0 + %0 = COPY %r0 + %1, %cpsr = tLSLri %0, 2, 14, _ + tCMPi8 %0, 5, 14, _, implicit-def %cpsr + tBcc %bb.8, 8, %cpsr + + bb.1: + %2, %cpsr = tLSLri %0, 2, 14, _ + + bb.8: + liveins: %cpsr + %3 = COPY %cpsr + tSTRi killed %3, %0, 0, 14, _ +... diff --git a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll index 172a00a7c86f..89cb71a52c04 100644 --- a/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll +++ b/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machine-licm" +; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn -stats 2>&1 | grep "9 machinelicm" ; RUN: llc < %s -march=x86-64 -mattr=+sse3,+sse4.1 -mcpu=penryn | FileCheck %s ; rdar://6627786 ; rdar://7792037 diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll index f1ffc15f4d03..870e812bbb69 100644 --- a/test/CodeGen/X86/GlobalISel/memop-vec.ll +++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -1,39 +1,116 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=SKX define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; ALL-LABEL: test_load_v4i32_noalign: -; ALL: # BB#0: -; ALL-NEXT: vmovups (%rdi), %xmm0 -; ALL-NEXT: retq +; SKX-LABEL: test_load_v4i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %xmm0 +; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 1 ret <4 x i32> %r } define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; ALL-LABEL: test_load_v4i32_align: -; ALL: # BB#0: -; ALL-NEXT: vmovaps (%rdi), %xmm0 -; ALL-NEXT: retq +; SKX-LABEL: test_load_v4i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %xmm0 +; SKX-NEXT: retq %r = load <4 x i32>, <4 x i32>* %p1, align 16 ret <4 x i32> %r } +define <8 x i32> @test_load_v8i32_noalign(<8 x i32> * %p1) { +; SKX-LABEL: test_load_v8i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %ymm0 +; SKX-NEXT: retq + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r +} + +define <8 x i32> @test_load_v8i32_align(<8 x i32> * %p1) { +; SKX-LABEL: test_load_v8i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps (%rdi), %ymm0 +; SKX-NEXT: retq + %r = load <8 x i32>, <8 x i32>* %p1, align 32 + ret <8 x i32> %r +} + +define <16 x i32> @test_load_v16i32_noalign(<16 x i32> * %p1) { +; SKX-LABEL: test_load_v16i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: retq + %r = load <16 x i32>, <16 x i32>* 
%p1, align 1 + ret <16 x i32> %r +} + +define <16 x i32> @test_load_v16i32_align(<16 x i32> * %p1) { +; SKX-LABEL: test_load_v16i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovups (%rdi), %zmm0 +; SKX-NEXT: retq + %r = load <16 x i32>, <16 x i32>* %p1, align 32 + ret <16 x i32> %r +} + define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { -; ALL-LABEL: test_store_v4i32_noalign: -; ALL: # BB#0: -; ALL-NEXT: vmovups %xmm0, (%rdi) -; ALL-NEXT: retq +; SKX-LABEL: test_store_v4i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %xmm0, (%rdi) +; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 1 ret void } define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { -; ALL-LABEL: test_store_v4i32_align: -; ALL: # BB#0: -; ALL-NEXT: vmovaps %xmm0, (%rdi) -; ALL-NEXT: retq +; SKX-LABEL: test_store_v4i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %xmm0, (%rdi) +; SKX-NEXT: retq store <4 x i32> %val, <4 x i32>* %p1, align 16 ret void } + +define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { +; SKX-LABEL: test_store_v8i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { +; SKX-LABEL: test_store_v8i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %ymm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <8 x i32> %val, <8 x i32>* %p1, align 32 + ret void +} + +define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { +; SKX-LABEL: test_store_v16i32_noalign: +; SKX: # BB#0: +; SKX-NEXT: vmovups %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { +; SKX-LABEL: test_store_v16i32_align: +; SKX: # BB#0: +; SKX-NEXT: vmovaps %zmm0, (%rdi) +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + store <16 x i32> %val, <16 x i32>* %p1, align 64 + ret void +} + diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir index f925c836f3d1..cc03f3a57f0b 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX2.mir @@ -14,7 +14,16 @@ ret void } -... + define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r + } + + define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void + } + --- name: test_mul_vec256 alignment: 4 @@ -84,3 +93,47 @@ body: | RET 0 ... +--- +name: test_load_v8i32_noalign +# CHECK-LABEL: name: test_load_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: vecr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... 
+--- +name: test_store_v8i32_noalign +# CHECK-LABEL: name: test_store_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1) + RET 0 + +... diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir index e0c12ff44a2f..278413ad38ef 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-AVX512.mir @@ -15,22 +15,29 @@ ret void } + define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r + } + + define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void + } + ... --- name: test_mul_vec512 +# CHECK-LABEL: name: test_mul_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_mul_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): @@ -41,19 +48,16 @@ body: | ... --- name: test_add_vec512 +# CHECK-LABEL: name: test_add_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_add_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): @@ -64,24 +68,65 @@ body: | ... --- name: test_sub_vec512 +# CHECK-LABEL: name: test_sub_vec512 alignment: 4 legalized: true regBankSelected: false -selected: false -tracksRegLiveness: true -# CHECK-LABEL: name: test_sub_vec512 -# CHECK: registers: -# CHECK: - { id: 0, class: vecr } -# CHECK: - { id: 1, class: vecr } +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: vecr } registers: - { id: 0, class: _ } - { id: 1, class: _ } - - { id: 2, class: _ } body: | bb.1 (%ir-block.0): %0(<16 x s32>) = IMPLICIT_DEF %1(<16 x s32>) = G_SUB %0, %0 RET 0 +... +--- + +name: test_load_v16i32_noalign +# CHECK-LABEL: name: test_load_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: vecr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... 
+--- +name: test_store_v16i32_noalign +# CHECK-LABEL: name: test_store_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: vecr } +# CHECK-NEXT: - { id: 1, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1) + RET 0 ... diff --git a/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir new file mode 100644 index 000000000000..539520c0b8f5 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-leaf-constant.mir @@ -0,0 +1,96 @@ +# RUN: llc -mtriple=i586-linux-gnu -mcpu=haswell -mattr=-slow-incdec -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK +# +# This is necessary to test that attribute-based rule predicates work and that +# they properly reset between functions. + +--- | + define i32 @const_i32_1() { + ret i32 1 + } + + define i32 @const_i32_1_optsize() #0 { + ret i32 1 + } + + define i32 @const_i32_1b() { + ret i32 1 + } + + define i32 @const_i32_1_optsizeb() #0 { + ret i32 1 + } + + attributes #0 = { optsize } +... +--- +name: const_i32_1 +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1 +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32ri 1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1_optsize +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1_optsize +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32r1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1b +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1b +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32ri 1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... +--- +name: const_i32_1_optsizeb +legalized: true +regBankSelected: true +selected: false +# CHECK-LABEL: name: const_i32_1_optsizeb +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +registers: + - { id: 0, class: gpr } +# CHECK: body: +# CHECK: %0 = MOV32r1 +body: | + bb.1 (%ir-block.0): + %0(s32) = G_CONSTANT i32 1 + %eax = COPY %0(s32) + RET 0, implicit %eax +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v256.mir b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir new file mode 100644 index 000000000000..b9a7e4a8cc4a --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v256.mir @@ -0,0 +1,188 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + + +--- | + define <8 x i32> @test_load_v8i32_noalign(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 1 + ret <8 x i32> %r + } + + define <8 x i32> @test_load_v8i32_align(<8 x i32>* %p1) { + %r = load <8 x i32>, <8 x i32>* %p1, align 32 + ret <8 x i32> %r + } + + define void @test_store_v8i32_noalign(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 1 + ret void + } + + define void @test_store_v8i32_align(<8 x i32> %val, <8 x i32>* %p1) { + store <8 x i32> %val, <8 x i32>* %p1, align 32 + ret void + } + + +... +--- +name: test_load_v8i32_noalign +# ALL-LABEL: name: test_load_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: gr64 } +# NO_AVX512F-NEXT: - { id: 1, class: vr256 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: gr64 } +# AVX512ALL-NEXT: - { id: 1, class: vr256x } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# NO_AVX512F: %0 = COPY %rdi +# NO_AVX512F-NEXT: %1 = VMOVUPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# NO_AVX512F-NEXT: %ymm0 = COPY %1 +# NO_AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# AVX512F-NEXT: %ymm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512VL: %0 = COPY %rdi +# AVX512VL-NEXT: %1 = VMOVUPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1, align 1) +# AVX512VL-NEXT: %ymm0 = COPY %1 +# AVX512VL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1, align 1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... 
+--- +name: test_load_v8i32_align +# ALL-LABEL: name: test_load_v8i32_align +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: gr64 } +# NO_AVX512F-NEXT: - { id: 1, class: vr256 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: gr64 } +# AVX512ALL-NEXT: - { id: 1, class: vr256x } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# NO_AVX512F: %0 = COPY %rdi +# NO_AVX512F-NEXT: %1 = VMOVAPSYrm %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# NO_AVX512F-NEXT: %ymm0 = COPY %1 +# NO_AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVAPSZ256rm_NOVLX %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# AVX512F-NEXT: %ymm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %ymm0 +# +# AVX512VL: %0 = COPY %rdi +# AVX512VL-NEXT: %1 = VMOVAPSZ256rm %0, 1, _, 0, _ :: (load 32 from %ir.p1) +# AVX512VL-NEXT: %ymm0 = COPY %1 +# AVX512VL-NEXT: RET 0, implicit %ymm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<8 x s32>) = G_LOAD %0(p0) :: (load 32 from %ir.p1) + %ymm0 = COPY %1(<8 x s32>) + RET 0, implicit %ymm0 + +... +--- +name: test_store_v8i32_noalign +# ALL-LABEL: name: test_store_v8i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: vr256 } +# NO_AVX512F-NEXT: - { id: 1, class: gr64 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: vr256x } +# AVX512ALL-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# NO_AVX512F: %0 = COPY %ymm0 +# NO_AVX512F-NEXT: %1 = COPY %rdi +# NO_AVX512F-NEXT: VMOVUPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# NO_AVX512F-NEXT: RET 0 +# +# AVX512F: %0 = COPY %ymm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# AVX512F-NEXT: RET 0 +# +# AVX512VL: %0 = COPY %ymm0 +# AVX512VL-NEXT: %1 = COPY %rdi +# AVX512VL-NEXT: VMOVUPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1, align 1) +# AVX512VL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1, align 1) + RET 0 + +... +--- +name: test_store_v8i32_align +# ALL-LABEL: name: test_store_v8i32_align +alignment: 4 +legalized: true +regBankSelected: true +# NO_AVX512F: registers: +# NO_AVX512F-NEXT: - { id: 0, class: vr256 } +# NO_AVX512F-NEXT: - { id: 1, class: gr64 } +# +# AVX512ALL: registers: +# AVX512ALL-NEXT: - { id: 0, class: vr256x } +# AVX512ALL-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# NO_AVX512F: %0 = COPY %ymm0 +# NO_AVX512F-NEXT: %1 = COPY %rdi +# NO_AVX512F-NEXT: VMOVAPSYmr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# NO_AVX512F-NEXT: RET 0 +# +# AVX512F: %0 = COPY %ymm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVAPSZ256mr_NOVLX %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# AVX512F-NEXT: RET 0 +# +# AVX512VL: %0 = COPY %ymm0 +# AVX512VL-NEXT: %1 = COPY %rdi +# AVX512VL-NEXT: VMOVAPSZ256mr %1, 1, _, 0, _, %0 :: (store 32 into %ir.p1) +# AVX512VL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %ymm0 + + %0(<8 x s32>) = COPY %ymm0 + %1(p0) = COPY %rdi + G_STORE %0(<8 x s32>), %1(p0) :: (store 32 into %ir.p1) + RET 0 + +... 
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v512.mir b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir new file mode 100644 index 000000000000..87978a684d4c --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v512.mir @@ -0,0 +1,127 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=AVX512F +--- | + define <16 x i32> @test_load_v16i32_noalign(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 1 + ret <16 x i32> %r + } + + define <16 x i32> @test_load_v16i32_align(<16 x i32>* %p1) { + %r = load <16 x i32>, <16 x i32>* %p1, align 32 + ret <16 x i32> %r + } + + define void @test_store_v16i32_noalign(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 1 + ret void + } + + define void @test_store_v16i32_align(<16 x i32> %val, <16 x i32>* %p1) { + store <16 x i32> %val, <16 x i32>* %p1, align 32 + ret void + } + +... +--- +name: test_load_v16i32_noalign +# AVX512F-LABEL: name: test_load_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: gr64 } +# AVX512F-NEXT: - { id: 1, class: vr512 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 1) +# AVX512F-NEXT: %zmm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %zmm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 1) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_load_v16i32_align +# AVX512F-LABEL: name: test_load_v16i32_align +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: gr64 } +# AVX512F-NEXT: - { id: 1, class: vr512 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# AVX512F: %0 = COPY %rdi +# AVX512F-NEXT: %1 = VMOVUPSZrm %0, 1, _, 0, _ :: (load 64 from %ir.p1, align 32) +# AVX512F-NEXT: %zmm0 = COPY %1 +# AVX512F-NEXT: RET 0, implicit %zmm0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<16 x s32>) = G_LOAD %0(p0) :: (load 64 from %ir.p1, align 32) + %zmm0 = COPY %1(<16 x s32>) + RET 0, implicit %zmm0 + +... +--- +name: test_store_v16i32_noalign +# AVX512F-LABEL: name: test_store_v16i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: vr512 } +# AVX512F-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# AVX512F: %0 = COPY %zmm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 1) +# AVX512F-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 1) + RET 0 + +... 
+--- +name: test_store_v16i32_align +# AVX512F-LABEL: name: test_store_v16i32_align +alignment: 4 +legalized: true +regBankSelected: true +# AVX512F: registers: +# AVX512F-NEXT: - { id: 0, class: vr512 } +# AVX512F-NEXT: - { id: 1, class: gr64 } +registers: + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# AVX512F: %0 = COPY %zmm0 +# AVX512F-NEXT: %1 = COPY %rdi +# AVX512F-NEXT: VMOVUPSZmr %1, 1, _, 0, _, %0 :: (store 64 into %ir.p1, align 32) +# AVX512F-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %zmm0 + + %0(<16 x s32>) = COPY %zmm0 + %1(p0) = COPY %rdi + G_STORE %0(<16 x s32>), %1(p0) :: (store 64 into %ir.p1, align 32) + RET 0 + +... diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll index cf514d7aeb31..016ddb9c5e78 100644 --- a/test/CodeGen/X86/avx-vzeroupper.ll +++ b/test/CodeGen/X86/avx-vzeroupper.ll @@ -1,10 +1,8 @@ -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s -; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s - -; FAST-YMM-ZMM-NOT: vzeroupper -; BTVER2-NOT: vzeroupper +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=VZ --check-prefix=AVX512 +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=FAST-YMM-ZMM +; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-unknown-unknown -mcpu=btver2 | FileCheck %s --check-prefix=ALL --check-prefix=NO-VZ --check-prefix=BTVER2 declare i32 @foo() declare <4 x float> @do_sse(<4 x float>) @@ -15,43 +13,86 @@ declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind ;; Basic checking - don't emit any vzeroupper instruction -; CHECK: _test00 -define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { -entry: - ; CHECK-NOT: vzeroupper +define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind { +; ALL-LABEL: test00: +; ALL: # BB#0: +; ALL-NEXT: pushq %rax +; ALL-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; ALL-NEXT: callq do_sse +; ALL-NEXT: popq %rax +; ALL-NEXT: retq %add.i = fadd <4 x float> %a, %b %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind - ; CHECK: ret ret <4 x float> %call3 } ;; Check parameter 256-bit parameter passing -; CHECK: _test01 -define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp { -entry: +define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind { +; VZ-LABEL: test01: +; VZ: # BB#0: +; VZ-NEXT: subq $56, %rsp +; VZ-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; VZ-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; VZ-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; VZ-NEXT: addq $56, %rsp +; VZ-NEXT: retq +; +; 
FAST-YMM-ZMM-LABEL: test01: +; FAST-YMM-ZMM: # BB#0: +; FAST-YMM-ZMM-NEXT: subq $56, %rsp +; FAST-YMM-ZMM-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; FAST-YMM-ZMM-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; FAST-YMM-ZMM-NEXT: addq $56, %rsp +; FAST-YMM-ZMM-NEXT: retq +; +; BTVER2-LABEL: test01: +; BTVER2: # BB#0: +; BTVER2-NEXT: subq $56, %rsp +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %xmm0 +; BTVER2-NEXT: vmovups %ymm2, (%rsp) # 32-byte Spill +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps %xmm0, {{.*}}(%rip) +; BTVER2-NEXT: vmovups (%rsp), %ymm0 # 32-byte Reload +; BTVER2-NEXT: addq $56, %rsp +; BTVER2-NEXT: retq %tmp = load <4 x float>, <4 x float>* @x, align 16 - ; CHECK: vzeroupper - ; CHECK-NEXT: callq _do_sse %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind store <4 x float> %call, <4 x float>* @x, align 16 - ; CHECK-NOT: vzeroupper - ; CHECK: callq _do_sse %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind store <4 x float> %call2, <4 x float>* @x, align 16 - ; CHECK: ret ret <8 x float> %c } ;; Check that vzeroupper is emitted for tail calls. -; CHECK: _test02 -define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp { -entry: +define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind { +; VZ-LABEL: test02: +; VZ: # BB#0: +; VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: jmp do_sse # TAILCALL +; +; NO-VZ-LABEL: test02: +; NO-VZ: # BB#0: +; NO-VZ-NEXT: vaddps %ymm1, %ymm0, %ymm0 +; NO-VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NO-VZ-NEXT: jmp do_sse # TAILCALL %add.i = fadd <8 x float> %a, %b %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0) - ; CHECK: vzeroupper - ; CHECK: jmp _do_sse %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind ret <4 x float> %call3 } @@ -59,30 +100,113 @@ entry: ;; Test the pass convergence and also that vzeroupper is only issued when necessary, ;; for this function it should be only once -; CHECK: _test03 -define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { +define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind { +; VZ-LABEL: test03: +; VZ: # BB#0: # %entry +; VZ-NEXT: pushq %rbx +; VZ-NEXT: subq $16, %rsp +; VZ-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; VZ-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; VZ-NEXT: .p2align 4, 0x90 +; VZ-NEXT: .LBB3_1: # %while.cond +; VZ-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-NEXT: callq foo +; VZ-NEXT: testl %eax, %eax +; VZ-NEXT: jne .LBB3_1 +; VZ-NEXT: # BB#2: # %for.body.preheader +; VZ-NEXT: movl $4, %ebx +; VZ-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; VZ-NEXT: .p2align 4, 0x90 +; VZ-NEXT: .LBB3_3: # %for.body +; VZ-NEXT: # =>This Inner Loop Header: Depth=1 +; VZ-NEXT: callq do_sse +; VZ-NEXT: callq do_sse +; VZ-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; VZ-NEXT: vextractf128 $1, %ymm0, %xmm0 +; VZ-NEXT: vzeroupper +; VZ-NEXT: callq do_sse +; VZ-NEXT: decl %ebx +; VZ-NEXT: jne .LBB3_3 +; VZ-NEXT: # BB#4: # %for.end +; VZ-NEXT: addq $16, %rsp +; VZ-NEXT: popq %rbx +; VZ-NEXT: retq +; +; FAST-YMM-ZMM-LABEL: test03: +; FAST-YMM-ZMM: # BB#0: # %entry +; FAST-YMM-ZMM-NEXT: pushq %rbx +; FAST-YMM-ZMM-NEXT: 
subq $16, %rsp +; FAST-YMM-ZMM-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; FAST-YMM-ZMM-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90 +; FAST-YMM-ZMM-NEXT: .LBB3_1: # %while.cond +; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-YMM-ZMM-NEXT: callq foo +; FAST-YMM-ZMM-NEXT: testl %eax, %eax +; FAST-YMM-ZMM-NEXT: jne .LBB3_1 +; FAST-YMM-ZMM-NEXT: # BB#2: # %for.body.preheader +; FAST-YMM-ZMM-NEXT: movl $4, %ebx +; FAST-YMM-ZMM-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; FAST-YMM-ZMM-NEXT: .p2align 4, 0x90 +; FAST-YMM-ZMM-NEXT: .LBB3_3: # %for.body +; FAST-YMM-ZMM-NEXT: # =>This Inner Loop Header: Depth=1 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; FAST-YMM-ZMM-NEXT: vextractf128 $1, %ymm0, %xmm0 +; FAST-YMM-ZMM-NEXT: callq do_sse +; FAST-YMM-ZMM-NEXT: decl %ebx +; FAST-YMM-ZMM-NEXT: jne .LBB3_3 +; FAST-YMM-ZMM-NEXT: # BB#4: # %for.end +; FAST-YMM-ZMM-NEXT: addq $16, %rsp +; FAST-YMM-ZMM-NEXT: popq %rbx +; FAST-YMM-ZMM-NEXT: retq +; +; BTVER2-LABEL: test03: +; BTVER2: # BB#0: # %entry +; BTVER2-NEXT: pushq %rbx +; BTVER2-NEXT: subq $16, %rsp +; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 +; BTVER2-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; BTVER2-NEXT: .p2align 4, 0x90 +; BTVER2-NEXT: .LBB3_1: # %while.cond +; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 +; BTVER2-NEXT: callq foo +; BTVER2-NEXT: testl %eax, %eax +; BTVER2-NEXT: jne .LBB3_1 +; BTVER2-NEXT: # BB#2: # %for.body.preheader +; BTVER2-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload +; BTVER2-NEXT: movl $4, %ebx +; BTVER2-NEXT: .p2align 4, 0x90 +; BTVER2-NEXT: .LBB3_3: # %for.body +; BTVER2-NEXT: # =>This Inner Loop Header: Depth=1 +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: vmovaps {{.*}}(%rip), %ymm0 +; BTVER2-NEXT: vextractf128 $1, %ymm0, %xmm0 +; BTVER2-NEXT: callq do_sse +; BTVER2-NEXT: decl %ebx +; BTVER2-NEXT: jne .LBB3_3 +; BTVER2-NEXT: # BB#4: # %for.end +; BTVER2-NEXT: addq $16, %rsp +; BTVER2-NEXT: popq %rbx +; BTVER2-NEXT: retq entry: %add.i = fadd <4 x float> %a, %b br label %while.cond -while.cond: +while.cond: %call = tail call i32 @foo() %tobool = icmp eq i32 %call, 0 br i1 %tobool, label %for.body, label %while.cond for.body: - ; CHECK: LBB - ; CHECK-NOT: vzeroupper %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ] %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ] - ; CHECK: callq _do_sse %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind - ; CHECK-NEXT: callq _do_sse %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind %tmp11 = load <8 x float>, <8 x float>* @g, align 32 %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind - ; CHECK: vzeroupper - ; CHECK-NEXT: callq _do_sse %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind %1 = add nsw i32 %i.018, 1 %exitcond = icmp eq i32 %1, 4 @@ -94,15 +218,30 @@ for.end: ;; Check that we also perform vzeroupper when we return from a function. 
-; CHECK: _test04 -define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp { -entry: +define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind { +; VZ-LABEL: test04: +; VZ: # BB#0: +; VZ-NEXT: pushq %rax +; VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; VZ-NEXT: callq do_avx +; VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; VZ-NEXT: popq %rax +; VZ-NEXT: vzeroupper +; VZ-NEXT: retq +; +; NO-VZ-LABEL: test04: +; NO-VZ: # BB#0: +; NO-VZ-NEXT: pushq %rax +; NO-VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NO-VZ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; NO-VZ-NEXT: callq do_avx +; NO-VZ-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; NO-VZ-NEXT: popq %rax +; NO-VZ-NEXT: retq %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> - ; CHECK-NOT: vzeroupper - ; CHECK: call %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> - ; CHECK: vzeroupper - ; CHECK: ret ret <4 x float> %shuf2 } + diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll index 3337f42eb142..51f9a382ccbf 100644 --- a/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -2216,9 +2216,9 @@ define i32@test_int_x86_avx512_kunpck_wd(i32 %x0, i32 %x1) { ; ; AVX512F-32-LABEL: test_int_x86_avx512_kunpck_wd: ; AVX512F-32: # BB#0: -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k0 -; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1 -; AVX512F-32-NEXT: kunpckwd %k1, %k0, %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k0 +; AVX512F-32-NEXT: kmovw {{[0-9]+}}(%esp), %k1 +; AVX512F-32-NEXT: kunpckwd %k0, %k1, %k0 ; AVX512F-32-NEXT: kmovd %k0, %eax ; AVX512F-32-NEXT: retl %res = call i32 @llvm.x86.avx512.kunpck.wd(i32 %x0, i32 %x1) diff --git a/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll new file mode 100644 index 000000000000..019c5282f63b --- /dev/null +++ b/test/CodeGen/X86/avx512vpopcntdq-intrinsics.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86_64 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vpopcntdq --show-mc-encoding | FileCheck %s --check-prefix=X86 + +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; The following tests check that patterns that includes ;; +;; ctpop intrinsic + select are translated to the vpopcntd/q ;; +;; instruction in a correct way. 
;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +define <16 x i32> @test_mask_vpopcnt_d(<16 x i32> %a, i16 %mask, <16 x i32> %b) { +; X86_64-LABEL: test_mask_vpopcnt_d: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mask_vpopcnt_d: +; X86: # BB#0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpopcntd %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0x7d,0x49,0x55,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %a + ret <16 x i32> %3 +} + +define <16 x i32> @test_maskz_vpopcnt_d(i16 %mask, <16 x i32> %a) { +; X86_64-LABEL: test_maskz_vpopcnt_d: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_maskz_vpopcnt_d: +; X86: # BB#0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpopcntd %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0x55,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %a) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> zeroinitializer + ret <16 x i32> %3 +} + +define <8 x i64> @test_mask_vpopcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) { +; X86_64-LABEL: test_mask_vpopcnt_q: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8] +; X86_64-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_mask_vpopcnt_q: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpopcntq %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x55,0xc8] +; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b + ret <8 x i64> %3 +} + +define <8 x i64> @test_maskz_vpopcnt_q(<8 x i64> %a, i8 %mask) { +; X86_64-LABEL: test_maskz_vpopcnt_q: +; X86_64: # BB#0: +; X86_64-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] +; X86_64-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0] +; X86_64-NEXT: retq # encoding: [0xc3] +; +; X86-LABEL: test_maskz_vpopcnt_q: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] +; X86-NEXT: vpopcntq %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x55,0xc0] +; X86-NEXT: retl # encoding: [0xc3] + %1 = tail call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %a) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> zeroinitializer + ret <8 x i64> %3 +} + +declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) +declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) 
diff --git a/test/CodeGen/X86/fast-isel-select-cmp.ll b/test/CodeGen/X86/fast-isel-select-cmp.ll index 1af30e9f32fe..4a8e8792f98d 100644 --- a/test/CodeGen/X86/fast-isel-select-cmp.ll +++ b/test/CodeGen/X86/fast-isel-select-cmp.ll @@ -4,9 +4,9 @@ ; different basic blocks. define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) { -; CHECK-LABEL: select_cmp_cmov_i32 +; CHECK-LABEL: select_cmp_cmov_i32: ; CHECK-LABEL: continue -; CHECK-NOT: cmp +; CHECK-NOT: cmp{{[^_]}} %1 = icmp ult i32 %a, %b br i1 %1, label %continue, label %exit @@ -19,9 +19,9 @@ exit: } define float @select_fcmp_oeq_f32(float %a, float %b, float %c, float %d) { -; CHECK-LABEL: select_fcmp_oeq_f32 +; CHECK-LABEL: select_fcmp_oeq_f32: ; CHECK-LABEL: continue -; CHECK-NOT: cmp +; CHECK-NOT: cmp{{[^_]}} %1 = fcmp oeq float %a, %b br i1 %1, label %continue, label %exit @@ -34,7 +34,7 @@ exit: } define float @select_fcmp_one_f32(float %a, float %b, float %c, float %d) { -; CHECK-LABEL: select_fcmp_one_f32 +; CHECK-LABEL: select_fcmp_one_f32: ; CHECK-LABEL: continue ; CHECK-NOT: ucomi %1 = fcmp one float %a, %b diff --git a/test/CodeGen/X86/fp-intrinsics.ll b/test/CodeGen/X86/fp-intrinsics.ll index 88aef6bb0659..0f8d730d7535 100644 --- a/test/CodeGen/X86/fp-intrinsics.ll +++ b/test/CodeGen/X86/fp-intrinsics.ll @@ -103,9 +103,156 @@ if.end: ret double %a.0 } +; Verify that sqrt(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f5 +; CHECK: sqrtsd +define double @f5() { +entry: + %result = call double @llvm.experimental.constrained.sqrt.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that pow(42.1, 3.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f6 +; CHECK: pow +define double @f6() { +entry: + %result = call double @llvm.experimental.constrained.pow.f64(double 42.1, + double 3.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that powi(42.1, 3) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f7 +; CHECK: powi +define double @f7() { +entry: + %result = call double @llvm.experimental.constrained.powi.f64(double 42.1, + i32 3, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that sin(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f8 +; CHECK: sin +define double @f8() { +entry: + %result = call double @llvm.experimental.constrained.sin.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that cos(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f9 +; CHECK: cos +define double @f9() { +entry: + %result = call double @llvm.experimental.constrained.cos.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that exp(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f10 +; CHECK: exp +define double @f10() { +entry: + %result = call double @llvm.experimental.constrained.exp.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that exp2(42.1) isn't simplified when the rounding mode is unknown. 
+; CHECK-LABEL: f11 +; CHECK: exp2 +define double @f11() { +entry: + %result = call double @llvm.experimental.constrained.exp2.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f12 +; CHECK: log +define double @f12() { +entry: + %result = call double @llvm.experimental.constrained.log.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log10(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f13 +; CHECK: log10 +define double @f13() { +entry: + %result = call double @llvm.experimental.constrained.log10.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that log2(42.0) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f14 +; CHECK: log2 +define double @f14() { +entry: + %result = call double @llvm.experimental.constrained.log2.f64(double 42.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that rint(42.1) isn't simplified when the rounding mode is unknown. +; CHECK-LABEL: f15 +; CHECK: rint +define double @f15() { +entry: + %result = call double @llvm.experimental.constrained.rint.f64(double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} + +; Verify that nearbyint(42.1) isn't simplified when the rounding mode is +; unknown. +; CHECK-LABEL: f16 +; CHECK: nearbyint +define double @f16() { +entry: + %result = call double @llvm.experimental.constrained.nearbyint.f64( + double 42.1, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + ret double %result +} @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata" declare double @llvm.experimental.constrained.fdiv.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fmul.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fadd.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.fsub.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata) +declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata) +declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.log2.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.rint.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.nearbyint.f64(double, metadata, metadata) diff --git a/test/CodeGen/X86/hoist-invariant-load.ll b/test/CodeGen/X86/hoist-invariant-load.ll index 5ade5b470b54..e7929c9cecdc 100644 --- a/test/CodeGen/X86/hoist-invariant-load.ll +++ b/test/CodeGen/X86/hoist-invariant-load.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc -mcpu=haswell < %s 
-stats -O2 2>&1 | grep "4 machine-licm.*hoisted" +; RUN: llc -mcpu=haswell < %s -stats -O2 2>&1 | grep "4 machinelicm.*hoisted" ; For test: ; 2 invariant loads, 1 for OBJC_SELECTOR_REFERENCES_ ; and 1 for objc_msgSend from the GOT diff --git a/test/CodeGen/X86/misched-copy.ll b/test/CodeGen/X86/misched-copy.ll index 3e3729285d27..7abd157f147a 100644 --- a/test/CodeGen/X86/misched-copy.ll +++ b/test/CodeGen/X86/misched-copy.ll @@ -1,5 +1,5 @@ ; REQUIRES: asserts -; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -march=x86 -mcpu=core2 -pre-RA-sched=source -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s ; ; Test scheduling of copy instructions. ; diff --git a/test/CodeGen/X86/or-branch.ll b/test/CodeGen/X86/or-branch.ll index 4899a0fc7e88..71d7746642e9 100644 --- a/test/CodeGen/X86/or-branch.ll +++ b/test/CodeGen/X86/or-branch.ll @@ -1,16 +1,34 @@ -; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 --check-prefix=CHECK -; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 --check-prefix=CHECK +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=0 | FileCheck %s --check-prefix=JUMP2 +; RUN: llc < %s -mtriple=i386-unknown-unknown -jump-is-expensive=1 | FileCheck %s --check-prefix=JUMP1 define void @foo(i32 %X, i32 %Y, i32 %Z) nounwind { ; JUMP2-LABEL: foo: -; JUMP2-DAG: jl -; JUMP2-DAG: je +; JUMP2: # BB#0: # %entry +; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP2-NEXT: jl .LBB0_3 +; JUMP2-NEXT: # BB#1: # %entry +; JUMP2-NEXT: movl {{[0-9]+}}(%esp), %eax +; JUMP2-NEXT: testl %eax, %eax +; JUMP2-NEXT: je .LBB0_3 +; JUMP2-NEXT: # BB#2: # %UnifiedReturnBlock +; JUMP2-NEXT: retl +; JUMP2-NEXT: .LBB0_3: # %cond_true +; JUMP2-NEXT: jmp bar # TAILCALL ; ; JUMP1-LABEL: foo: -; JUMP1-DAG: sete -; JUMP1-DAG: setl -; JUMP1: orb -; JUMP1: jne +; JUMP1: # BB#0: # %entry +; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP1-NEXT: sete %al +; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setl %cl +; JUMP1-NEXT: orb %al, %cl +; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: jne .LBB0_1 +; JUMP1-NEXT: # BB#2: # %cond_true +; JUMP1-NEXT: jmp bar # TAILCALL +; JUMP1-NEXT: .LBB0_1: # %UnifiedReturnBlock +; JUMP1-NEXT: retl entry: %tmp1 = icmp eq i32 %X, 0 %tmp3 = icmp slt i32 %Y, 5 @@ -29,11 +47,33 @@ UnifiedReturnBlock: ; regardless of whether they are expensive or not. 
define void @unpredictable(i32 %X, i32 %Y, i32 %Z) nounwind { -; CHECK-LABEL: unpredictable: -; CHECK-DAG: sete -; CHECK-DAG: setl -; CHECK: orb -; CHECK: jne +; JUMP2-LABEL: unpredictable: +; JUMP2: # BB#0: # %entry +; JUMP2-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP2-NEXT: sete %al +; JUMP2-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP2-NEXT: setl %cl +; JUMP2-NEXT: orb %al, %cl +; JUMP2-NEXT: cmpb $1, %cl +; JUMP2-NEXT: jne .LBB1_1 +; JUMP2-NEXT: # BB#2: # %cond_true +; JUMP2-NEXT: jmp bar # TAILCALL +; JUMP2-NEXT: .LBB1_1: # %UnifiedReturnBlock +; JUMP2-NEXT: retl +; +; JUMP1-LABEL: unpredictable: +; JUMP1: # BB#0: # %entry +; JUMP1-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; JUMP1-NEXT: sete %al +; JUMP1-NEXT: cmpl $5, {{[0-9]+}}(%esp) +; JUMP1-NEXT: setl %cl +; JUMP1-NEXT: orb %al, %cl +; JUMP1-NEXT: cmpb $1, %cl +; JUMP1-NEXT: jne .LBB1_1 +; JUMP1-NEXT: # BB#2: # %cond_true +; JUMP1-NEXT: jmp bar # TAILCALL +; JUMP1-NEXT: .LBB1_1: # %UnifiedReturnBlock +; JUMP1-NEXT: retl entry: %tmp1 = icmp eq i32 %X, 0 %tmp3 = icmp slt i32 %Y, 5 diff --git a/test/CodeGen/X86/pr27681.mir b/test/CodeGen/X86/pr27681.mir index 002761bc1e68..956df172b253 100644 --- a/test/CodeGen/X86/pr27681.mir +++ b/test/CodeGen/X86/pr27681.mir @@ -57,7 +57,7 @@ body: | %cl = SETNEr implicit %eflags ; Verify that removal of the %bl antidependence does not use %ch ; as a replacement register. - ; CHECK: %cl = AND8rr %cl, killed %b + ; CHECK: %cl = AND8rr killed %cl, killed %b %cl = AND8rr killed %cl, killed %bl, implicit-def dead %eflags CMP32ri8 %ebp, -1, implicit-def %eflags %edx = MOV32ri 0 diff --git a/test/CodeGen/X86/sandybridge-loads.ll b/test/CodeGen/X86/sandybridge-loads.ll index 2e31154068fc..8570fe7fe7ba 100644 --- a/test/CodeGen/X86/sandybridge-loads.ll +++ b/test/CodeGen/X86/sandybridge-loads.ll @@ -1,13 +1,20 @@ -; RUN: llc -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -o - < %s | FileCheck %s - -;CHECK-LABEL: wideloads: -;CHECK: vmovaps -;CHECK: vinsertf128 -;CHECK: vmovaps -;CHECK-NOT: vinsertf128 -;CHECK: ret +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=corei7-avx | FileCheck %s define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +; CHECK-LABEL: wideloads: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %xmm0 +; CHECK-NEXT: vinsertf128 $1, 16(%rdi), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vcmpltps %ymm0, %ymm1, %ymm1 +; CHECK-NEXT: vmovaps (%rdx), %ymm2 +; CHECK-NEXT: vcmpltps %ymm0, %ymm2, %ymm0 +; CHECK-NEXT: vandps %ymm1, %ymm0, %ymm0 +; CHECK-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 +; CHECK-NEXT: vmovaps %ymm0, (%rax) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 16 ; <---- unaligned! %v1 = load <8 x float>, <8 x float>* %b, align 32 ; <---- aligned! 
%m0 = fcmp olt <8 x float> %v1, %v0 @@ -19,17 +26,16 @@ define void @wideloads(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwi ret void } -; CHECK: widestores -; loads: -; CHECK: vmovaps -; CHECK: vmovaps -; stores: -; CHECK: vmovaps -; CHECK: vextractf128 -; CHECK: vmovaps -;CHECK: ret - define void @widestores(<8 x float>* %a, <8 x float>* %b, <8 x float>* %c) nounwind uwtable noinline ssp { +; CHECK-LABEL: widestores: +; CHECK: # BB#0: +; CHECK-NEXT: vmovaps (%rdi), %ymm0 +; CHECK-NEXT: vmovaps (%rsi), %ymm1 +; CHECK-NEXT: vmovaps %ymm0, (%rsi) +; CHECK-NEXT: vextractf128 $1, %ymm1, 16(%rdi) +; CHECK-NEXT: vmovaps %xmm1, (%rdi) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq %v0 = load <8 x float>, <8 x float>* %a, align 32 %v1 = load <8 x float>, <8 x float>* %b, align 32 store <8 x float> %v0, <8 x float>* %b, align 32 ; <--- aligned diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll index 383ab21bd404..19305d0dad62 100644 --- a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll @@ -354,9 +354,8 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea define i32 @test_mm_crc32_u8(i32 %a0, i8 %a1) { ; X32-LABEL: test_mm_crc32_u8: ; X32: # BB#0: -; X32-NEXT: movb {{[0-9]+}}(%esp), %cl ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: crc32b %cl, %eax +; X32-NEXT: crc32b {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_crc32_u8: @@ -372,9 +371,8 @@ declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind readnone define i32 @test_mm_crc32_u16(i32 %a0, i16 %a1) { ; X32-LABEL: test_mm_crc32_u16: ; X32: # BB#0: -; X32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: crc32w %cx, %eax +; X32-NEXT: crc32w {{[0-9]+}}(%esp), %eax ; X32-NEXT: retl ; ; X64-LABEL: test_mm_crc32_u16: diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll index 72542f499087..a00d47bb13e9 100644 --- a/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -1651,26 +1651,9 @@ define <8 x float> @stack_fold_sqrtps_ymm(<8 x float> %a0) { } declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone -define double @stack_fold_sqrtsd(double %a0) { - ;CHECK-LABEL: stack_fold_sqrtsd - ;CHECK: vsqrtsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call double @llvm.sqrt.f64(double %a0) - ret double %2 -} -declare double @llvm.sqrt.f64(double) nounwind readnone - +; TODO stack_fold_sqrtsd ; TODO stack_fold_sqrtsd_int - -define float @stack_fold_sqrtss(float %a0) { - ;CHECK-LABEL: stack_fold_sqrtss - ;CHECK: vsqrtss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload - %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() - %2 = call float @llvm.sqrt.f32(float %a0) - ret float %2 -} -declare float @llvm.sqrt.f32(float) nounwind readnone - +; TODO stack_fold_sqrtss ; TODO stack_fold_sqrtss_int define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) { diff --git a/test/CodeGen/X86/twoaddr-coalesce-2.ll 
b/test/CodeGen/X86/twoaddr-coalesce-2.ll index cbcde0655597..9da071f7ede6 100644 --- a/test/CodeGen/X86/twoaddr-coalesce-2.ll +++ b/test/CodeGen/X86/twoaddr-coalesce-2.ll @@ -1,6 +1,6 @@ ; REQUIRES: asserts ; RUN: llc < %s -march=x86 -mattr=+sse2 -mcpu=penryn -stats 2>&1 | \ -; RUN: grep "twoaddrinstr" | grep "Number of instructions aggressively commuted" +; RUN: grep "twoaddressinstruction" | grep "Number of instructions aggressively commuted" ; rdar://6480363 target triple = "i386-apple-darwin9.6" diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll index f737ea2b7fba..4d183f3172b3 100644 --- a/test/CodeGen/X86/vector-narrow-binop.ll +++ b/test/CodeGen/X86/vector-narrow-binop.ll @@ -22,17 +22,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d ; ; AVX1-LABEL: PR32790: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 -; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm2, %xmm4, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: PR32790: @@ -60,46 +60,17 @@ define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { ; SSE-LABEL: do_not_use_256bit_op: ; SSE: # BB#0: -; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: psubd %xmm1, %xmm0 ; SSE-NEXT: retq ; -; AVX1-LABEL: do_not_use_256bit_op: -; AVX1: # BB#0: -; AVX1-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: do_not_use_256bit_op: -; AVX2: # BB#0: -; AVX2-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: do_not_use_256bit_op: -; AVX512: # BB#0: -; AVX512-NEXT: # kill: %XMM2 %XMM2 %YMM2 -; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 -; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 -; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: do_not_use_256bit_op: +; AVX: # BB#0: +; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0 +; 
AVX-NEXT: vpand %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> %and = and <8 x i32> %concat1, %concat2 diff --git a/test/CodeGen/X86/vector-popcnt-128.ll b/test/CodeGen/X86/vector-popcnt-128.ll index 27909c6bb4a0..adda108bdc77 100644 --- a/test/CodeGen/X86/vector-popcnt-128.ll +++ b/test/CodeGen/X86/vector-popcnt-128.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -81,19 +82,41 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE41-NEXT: psadbw %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv2i64: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv2i64: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv2i64: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %in) ret <2 x i64> %out } @@ -193,23 +216,49 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; SSE41-NEXT: packuswb %xmm3, %xmm0 ; SSE41-NEXT: retq ; -; AVX-LABEL: testv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm2 
-; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX-NEXT: vpshufb %xmm2, %xmm3, %xmm2 -; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX-NEXT: vpand %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpshufb %xmm0, %xmm3, %xmm0 -; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 -; AVX-NEXT: retq +; AVX1-LABEL: testv4i32: +; AVX1: # BB#0: +; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: testv4i32: +; AVX2: # BB#0: +; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX2-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; AVX2-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 +; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero +; AVX2-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %in) ret <4 x i32> %out } diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll index 7a675619d720..accbad35e9d7 100644 --- a/test/CodeGen/X86/vector-popcnt-256.ll +++ b/test/CodeGen/X86/vector-popcnt-256.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -39,6 +40,13 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; 
AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %in) ret <4 x i64> %out } @@ -92,6 +100,13 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %in) ret <8 x i32> %out } @@ -137,6 +152,21 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %in) ret <16 x i16> %out } @@ -173,6 +203,18 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpshufb %ymm0, %ymm3, %ymm0 ; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %in) ret <32 x i8> %out } diff --git a/test/CodeGen/X86/vector-popcnt-512.ll b/test/CodeGen/X86/vector-popcnt-512.ll index cf4f21e62b61..aa50206e7a5e 100644 --- a/test/CodeGen/X86/vector-popcnt-512.ll +++ b/test/CodeGen/X86/vector-popcnt-512.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> 
%in) nounwind { ; AVX512F-LABEL: testv8i64: @@ -39,6 +40,11 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpxord %zmm1, %zmm1, %zmm1 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %in) ret <8 x i64> %out } @@ -92,6 +98,11 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %in) ret <16 x i32> %out } @@ -135,6 +146,30 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %in) ret <32 x i16> %out } @@ -169,6 +204,24 @@ define <64 x i8> @testv64i8(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm4, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm3, %ymm4, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm4, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm3, %ymm1, %ymm1 +; 
AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %in) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll index fa3471c2fe40..2e65bd8c75c7 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -282,8 +282,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) { ; ALL-LABEL: shuffle_v16f32_extract_256: ; ALL: # BB#0: -; ALL-NEXT: vmovups (%rsi), %zmm0 -; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm0 +; ALL-NEXT: vmovups 32(%rsi), %ymm0 ; ALL-NEXT: retq %ptr_a = bitcast float* %a to <16 x float>* %v_a = load <16 x float>, <16 x float>* %ptr_a, align 4 diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll index 5aab21749d14..706edd27a3f1 100644 --- a/test/CodeGen/X86/vector-shuffle-avx512.ll +++ b/test/CodeGen/X86/vector-shuffle-avx512.ll @@ -511,11 +511,10 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL64-LABEL: expand14: ; KNL64: # BB#0: +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; @@ -529,11 +528,10 @@ define <8 x float> @expand14(<4 x float> %a) { ; ; KNL32-LABEL: expand14: ; KNL32: # BB#0: +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[3,3,0,0] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,3,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> , @@ -545,39 +543,35 @@ define <8 x float> @expand14(<4 x float> %a) { define <8 x float> @expand15(<4 x float> %a) { ; SKX64-LABEL: expand15: ; SKX64: # BB#0: -; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; SKX64-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SKX64-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX64-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX64-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; SKX64-NEXT: retq ; ; KNL64-LABEL: expand15: ; KNL64: # BB#0: +; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL64-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL64-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL64-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL64-NEXT: retq ; ; SKX32-LABEL: expand15: ; SKX32: # BB#0: -; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[0,1,1,3] -; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = <0,2,4,0,u,u,u,u> -; 
SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,0,0] +; SKX32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; SKX32-NEXT: vpermilps {{.*#+}} xmm2 = xmm0[0,1,1,3] ; SKX32-NEXT: vmovaps {{.*#+}} ymm0 = [0,1,8,3,10,3,2,3] -; SKX32-NEXT: vpermi2ps %ymm1, %ymm2, %ymm0 +; SKX32-NEXT: vpermi2ps %ymm2, %ymm1, %ymm0 ; SKX32-NEXT: retl ; ; KNL32-LABEL: expand15: ; KNL32: # BB#0: +; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = mem[0,1,0,0] +; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; KNL32-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,0,1,3] -; KNL32-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,0,u,u,u,u> -; KNL32-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[0,1,0,0] -; KNL32-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,1,1] ; KNL32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6,7] ; KNL32-NEXT: retl %addV = fadd <4 x float> , diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll index c5ac4466b5fa..13088b7fa5f2 100644 --- a/test/CodeGen/X86/vector-sqrt.ll +++ b/test/CodeGen/X86/vector-sqrt.ll @@ -5,8 +5,10 @@ define <2 x double> @sqrtd2(double* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtd2: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtsd 8(%rdi), %xmm1, %xmm1 +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq entry: @@ -27,10 +29,14 @@ declare double @sqrt(double) local_unnamed_addr #1 define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK-LABEL: sqrtf4: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 -; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; CHECK-NEXT: vsqrtss 12(%rdi), %xmm3, %xmm3 +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm2, %xmm2, %xmm2 +; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: vsqrtss %xmm3, %xmm3, %xmm3 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll index a5fac9ac6a41..d4fbb72bbe6d 100644 --- a/test/CodeGen/X86/vector-trunc-math.ll +++ b/test/CodeGen/X86/vector-trunc-math.ll @@ -3030,10 +3030,10 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_and_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 @@ -3786,10 +3786,10 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_xor_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pxor 
%xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: pxor %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 @@ -4542,10 +4542,10 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind { define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind { ; SSE-LABEL: trunc_or_v8i32_v8i16: ; SSE: # BB#0: -; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: por %xmm3, %xmm1 ; SSE-NEXT: pslld $16, %xmm1 ; SSE-NEXT: psrad $16, %xmm1 +; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pslld $16, %xmm0 ; SSE-NEXT: psrad $16, %xmm0 ; SSE-NEXT: packssdw %xmm1, %xmm0 diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll index 22d0065b264f..a22a60756264 100644 --- a/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/test/CodeGen/X86/vector-tzcnt-128.ll @@ -7,6 +7,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 @@ -117,6 +118,17 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -284,6 +296,17 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -501,6 +524,18 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; 
AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -700,6 +735,18 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX512CD-NEXT: vzeroupper ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %XMM0 %XMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: vzeroupper +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -843,6 +890,25 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -984,6 +1050,25 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX-NEXT: vpsrlw $8, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %xmm0, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1106,6 +1191,22 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; 
AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1224,6 +1325,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %xmm0, %xmm1, %xmm1 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %xmm2, %xmm0, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: testv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: pxor %xmm1, %xmm1 @@ -1258,6 +1375,12 @@ define <2 x i64> @foldv2i64() nounwind { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1280,6 +1403,12 @@ define <2 x i64> @foldv2i64u() nounwind { ; AVX-NEXT: vmovq %rax, %xmm0 ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv2i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: movl $8, %eax +; AVX512VPOPCNTDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movl $8, %eax @@ -1300,6 +1429,11 @@ define <4 x i32> @foldv4i32() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1319,6 +1453,11 @@ define <4 x i32> @foldv4i32u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv4i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,32,0] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,32,0] @@ -1338,6 +1477,11 @@ define <8 x i16> @foldv8i16() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; 
; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1357,6 +1501,11 @@ define <8 x i16> @foldv8i16u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv8i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,16,0,16,0,3,3] @@ -1376,6 +1525,11 @@ define <16 x i8> @foldv16i8() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] @@ -1395,6 +1549,11 @@ define <16 x i8> @foldv16i8u() nounwind { ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] ; AVX-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: foldv16i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: ; X32-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,0,3,3,1,1,0,1,2,3,4,5] diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll index a0b277ddd732..101ae95550e7 100644 --- a/test/CodeGen/X86/vector-tzcnt-256.ll +++ b/test/CodeGen/X86/vector-tzcnt-256.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CDVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512cd,-avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512VPOPCNTDQ ; ; Just one 32-bit run to make sure we do reasonable things for i64 tzcnt. 
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 @@ -12,11 +13,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -28,6 +26,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -92,6 +92,17 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsadbw %ymm1, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -117,11 +128,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubq %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1] ; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -133,6 +141,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 ; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubq %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -182,6 +192,17 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX512CD-NEXT: vpsubq %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv4i64u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastq {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv4i64u: ; X32-AVX: # 
BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -205,28 +226,27 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] -; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -234,12 +254,12 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32: @@ -307,6 +327,17 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv8i32: ; 
X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -335,28 +366,27 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm2 -; AVX1-NEXT: vpsubd %xmm0, %xmm1, %xmm3 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 -; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm1, %xmm2, %xmm3 +; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1] -; AVX1-NEXT: vpsubd %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] ; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 -; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm5, %xmm5 -; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm2, %xmm2 -; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm5, %xmm5 +; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; AVX1-NEXT: vpsadbw %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm0, %xmm2, %xmm5 +; AVX1-NEXT: vpand %xmm5, %xmm0, %xmm0 ; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm3, %xmm6, %xmm3 @@ -364,12 +394,12 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX1-NEXT: vpsadbw %xmm1, %xmm3, %xmm3 +; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; AVX1-NEXT: vpsadbw %xmm2, %xmm3, %xmm3 ; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpsadbw %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsadbw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-LABEL: testv8i32u: @@ -414,6 +444,17 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX512CD-NEXT: vpsubd %ymm0, %ymm1, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv8i32u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpbroadcastd {{.*}}(%rip), %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: # kill: %YMM0 %YMM0 %ZMM0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: 
testv8i32u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -442,32 +483,31 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -532,6 +572,25 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, 
%ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -557,32 +616,31 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # BB#0: -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubw %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubw %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm2 -; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 -; AVX1-NEXT: vpaddb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpsllw $8, %xmm2, %xmm4 -; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2 +; AVX1-NEXT: vpand %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 +; AVX1-NEXT: vpaddb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpsllw $8, %xmm2, %xmm5 +; AVX1-NEXT: vpaddb %xmm2, %xmm5, %xmm2 ; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsubw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 +; AVX1-NEXT: vpsubw %xmm0, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpsubw %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $8, %xmm0, %xmm1 ; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0 @@ -647,6 +705,25 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX512CD-NEXT: vpsrlw $8, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv16i16u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: 
vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv16i16u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -674,27 +751,26 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -747,6 +823,22 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 @@ -771,27 +863,26 @@ define <32 x i8> 
@testv32i8u(<32 x i8> %in) nounwind { ; AVX1: # BB#0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 -; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 -; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] -; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm4 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX1-NEXT: vpshufb %xmm4, %xmm5, %xmm4 -; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm1, %xmm5, %xmm1 -; AVX1-NEXT: vpaddb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm2 -; AVX1-NEXT: vpshufb %xmm2, %xmm5, %xmm2 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm5 +; AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX1-NEXT: vpshufb %xmm5, %xmm6, %xmm5 +; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 +; AVX1-NEXT: vpand %xmm4, %xmm1, %xmm1 +; AVX1-NEXT: vpshufb %xmm1, %xmm6, %xmm1 +; AVX1-NEXT: vpaddb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpsubb %xmm0, %xmm2, %xmm2 +; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm2 +; AVX1-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0 -; AVX1-NEXT: vpand %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpshufb %xmm0, %xmm5, %xmm0 +; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0 +; AVX1-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-NEXT: vpaddb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -844,6 +935,22 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX512CD-NEXT: vpaddb %ymm2, %ymm0, %ymm0 ; AVX512CD-NEXT: retq ; +; AVX512VPOPCNTDQ-LABEL: testv32i8u: +; AVX512VPOPCNTDQ: # BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb {{.*}}(%rip), %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: retq +; ; X32-AVX-LABEL: testv32i8u: ; X32-AVX: # BB#0: ; X32-AVX-NEXT: vpxor %ymm1, %ymm1, %ymm1 diff --git a/test/CodeGen/X86/vector-tzcnt-512.ll b/test/CodeGen/X86/vector-tzcnt-512.ll index 2d1715949a5e..abbe964e983c 100644 --- a/test/CodeGen/X86/vector-tzcnt-512.ll +++ b/test/CodeGen/X86/vector-tzcnt-512.ll @@ -2,6 +2,7 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,-avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CD ; RUN: 
llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512CDBW ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=-avx512cd,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vpopcntdq | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VPOPCNTDQ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512CD-LABEL: testv8i64: @@ -64,6 +65,15 @@ define <8 x i64> @testv8i64(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 0) ret <8 x i64> %out } @@ -105,6 +115,15 @@ define <8 x i64> @testv8i64u(<8 x i64> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm3, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv8i64u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubq %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntq %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %in, i1 -1) ret <8 x i64> %out } @@ -186,6 +205,15 @@ define <16 x i32> @testv16i32(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 0) ret <16 x i32> %out } @@ -231,6 +259,15 @@ define <16 x i32> @testv16i32u(<16 x i32> %in) nounwind { ; AVX512BW-NEXT: vpsadbw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpackuswb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv16i32u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxord %zmm1, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpsubd %zmm0, %zmm1, %zmm1 +; AVX512VPOPCNTDQ-NEXT: vpandd %zmm1, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpsubd {{.*}}(%rip){1to16}, %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: vpopcntd %zmm0, %zmm0 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %in, i1 -1) ret <16 x i32> %out } @@ -305,6 +342,38 @@ define <32 x i16> @testv32i16(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; 
AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 0) ret <32 x i16> %out } @@ -379,6 +448,38 @@ define <32 x i16> @testv32i16u(<32 x i16> %in) nounwind { ; AVX512BW-NEXT: vpaddb %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpsrlw $8, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv32i16u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm0, %ymm5, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubw %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsllw $8, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %in, i1 -1) ret <32 x i16> %out } @@ -441,6 +542,32 @@ define <64 x i8> @testv64i8(<64 x i8> 
%in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 0) ret <64 x i8> %out } @@ -503,6 +630,32 @@ define <64 x i8> @testv64i8u(<64 x i8> %in) nounwind { ; AVX512BW-NEXT: vpshufb %zmm0, %zmm3, %zmm0 ; AVX512BW-NEXT: vpaddb %zmm2, %zmm0, %zmm0 ; AVX512BW-NEXT: retq +; +; AVX512VPOPCNTDQ-LABEL: testv64i8u: +; AVX512VPOPCNTDQ: ## BB#0: +; AVX512VPOPCNTDQ-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm0, %ymm2, %ymm3 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm5, %ymm0, %ymm0 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm1, %ymm2, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpsubb %ymm3, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX512VPOPCNTDQ-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX512VPOPCNTDQ-NEXT: vpaddb %ymm2, %ymm1, %ymm1 +; AVX512VPOPCNTDQ-NEXT: retq %out = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %in, i1 -1) ret <64 x i8> %out } diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll index fbaf500e8333..b5c7f86567a1 100644 --- 
a/test/CodeGen/X86/wide-integer-cmp.ll +++ b/test/CodeGen/X86/wide-integer-cmp.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=i686-linux-gnu %s -o - | FileCheck %s - define i32 @branch_eq(i64 %a, i64 %b) { ; CHECK-LABEL: branch_eq: ; CHECK: # BB#0: # %entry diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll index 6b2e4de5cdaa..42c4c23c6349 100644 --- a/test/CodeGen/X86/widened-broadcast.ll +++ b/test/CodeGen/X86/widened-broadcast.ll @@ -151,8 +151,7 @@ define <8 x i32> @load_splat_8i32_8i32_01010101(<8 x i32>* %ptr) nounwind uwtabl ; ; AVX1-LABEL: load_splat_8i32_8i32_01010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovapd (%rdi), %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] +; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -288,8 +287,7 @@ define <16 x i16> @load_splat_16i16_16i16_0101010101010101(<16 x i16>* %ptr) nou ; ; AVX1-LABEL: load_splat_16i16_16i16_0101010101010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = mem[0,0,0,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -315,22 +313,10 @@ define <16 x i16> @load_splat_16i16_16i16_0123012301230123(<16 x i16>* %ptr) nou ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vbroadcastsd (%rdi), %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_16i16_16i16_0123012301230123: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_16i16_16i16_0123012301230123: +; AVX: # BB#0: # %entry +; AVX-NEXT: vbroadcastsd (%rdi), %ymm0 +; AVX-NEXT: retq entry: %ld = load <16 x i16>, <16 x i16>* %ptr %ret = shufflevector <16 x i16> %ld, <16 x i16> undef, <16 x i32> @@ -513,8 +499,7 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(<32 x i8 ; ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovdqa (%rdi), %ymm0 -; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -587,26 +572,10 @@ define <4 x float> @load_splat_4f32_8f32_0000(<8 x float>* %ptr) nounwind uwtabl ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE-NEXT: retq ; -; AVX1-LABEL: load_splat_4f32_8f32_0000: -; AVX1: # BB#0: # %entry -; AVX1-NEXT: vmovaps (%rdi), %ymm0 -; AVX1-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: load_splat_4f32_8f32_0000: -; AVX2: # BB#0: # %entry -; AVX2-NEXT: vmovaps (%rdi), %ymm0 -; AVX2-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: load_splat_4f32_8f32_0000: -; AVX512: # BB#0: # %entry -; AVX512-NEXT: vmovaps (%rdi), %ymm0 -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: load_splat_4f32_8f32_0000: +; AVX: # BB#0: # %entry +; AVX-NEXT: 
vbroadcastss (%rdi), %xmm0
+; AVX-NEXT: retq
entry:
  %ld = load <8 x float>, <8 x float>* %ptr
  %ret = shufflevector <8 x float> %ld, <8 x float> undef, <4 x i32> zeroinitializer
@@ -627,22 +596,10 @@ define <8 x float> @load_splat_8f32_16f32_89898989(<16 x float>* %ptr) nounwind
; SSE42-NEXT: movapd %xmm0, %xmm1
; SSE42-NEXT: retq
;
-; AVX1-LABEL: load_splat_8f32_16f32_89898989:
-; AVX1: # BB#0: # %entry
-; AVX1-NEXT: vbroadcastsd 32(%rdi), %ymm0
-; AVX1-NEXT: retq
-;
-; AVX2-LABEL: load_splat_8f32_16f32_89898989:
-; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vbroadcastsd 32(%rdi), %ymm0
-; AVX2-NEXT: retq
-;
-; AVX512-LABEL: load_splat_8f32_16f32_89898989:
-; AVX512: # BB#0: # %entry
-; AVX512-NEXT: vmovapd (%rdi), %zmm0
-; AVX512-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; AVX512-NEXT: vbroadcastsd %xmm0, %ymm0
-; AVX512-NEXT: retq
+; AVX-LABEL: load_splat_8f32_16f32_89898989:
+; AVX: # BB#0: # %entry
+; AVX-NEXT: vbroadcastsd 32(%rdi), %ymm0
+; AVX-NEXT: retq
entry:
  %ld = load <16 x float>, <16 x float>* %ptr
  %ret = shufflevector <16 x float> %ld, <16 x float> undef, <8 x i32>
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..f4d0503f4a79 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -57,10 +57,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX1: # BB#0:
; AVX1-NEXT: vmovups (%rdi), %ymm0
; AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; AVX1-NEXT: vmovups 96(%rdi), %ymm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -69,10 +67,8 @@ define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX2: # BB#0:
; AVX2-NEXT: vmovupd (%rdi), %ymm0
; AVX2-NEXT: vmovupd 32(%rdi), %ymm1
-; AVX2-NEXT: vmovupd 64(%rdi), %ymm2
-; AVX2-NEXT: vmovupd 96(%rdi), %ymm3
-; AVX2-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT: vinsertf128 $1, 64(%rdi), %ymm0, %ymm0
+; AVX2-NEXT: vinsertf128 $1, 96(%rdi), %ymm1, %ymm1
; AVX2-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX2-NEXT: retq
diff --git a/test/CodeGen/X86/x87.ll b/test/CodeGen/X86/x87.ll
index 683d7b05cf8c..9bc654861b69 100644
--- a/test/CodeGen/X86/x87.ll
+++ b/test/CodeGen/X86/x87.ll
@@ -1,13 +1,16 @@
; RUN: llc < %s -march=x86 | FileCheck %s -check-prefix=X87
; RUN: llc < %s -march=x86-64 -mattr=-sse | FileCheck %s -check-prefix=X87
-; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
-; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87 --implicit-check-not "{{ }}f{{.*}}"
+; RUN: llc < %s -march=x86 -mattr=-x87 | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86 -mattr=-x87,+sse | FileCheck %s -check-prefix=NOX87
+; RUN: llc < %s -march=x86-64 -mattr=-x87,-sse2 | FileCheck %s -check-prefix=NOX87
define void @test(i32 %i, i64 %l, float* %pf, double* %pd, fp128* %pld) nounwind readnone {
; X87-LABEL: test:
; NOX87-LABEL: test:
+
+; NOX87-NOT: {{ }}f{{.*}}
+
; X87: fild
; NOX87: __floatunsisf
  %tmp = uitofp i32 %i to float
diff --git a/test/CodeGen/XCore/epilogue_prologue.ll b/test/CodeGen/XCore/epilogue_prologue.ll
index aed49f4b67ba..d214c40dd9b9 100644
--- a/test/CodeGen/XCore/epilogue_prologue.ll
+++ b/test/CodeGen/XCore/epilogue_prologue.ll
@@ -6,7 +6,7 @@
; When using FP, for large or small frames, we may need one scratch register.
; FP + small frame: spill FP+SR = entsp 2
-; CHECKFP-LABEL: f1
+; CHECKFP-LABEL: f1:
; CHECKFP: entsp 2
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -15,7 +15,7 @@
; CHECKFP-NEXT: retsp 2
;
; !FP + small frame: no spills = no stack adjustment needed
-; CHECK-LABEL: f1
+; CHECK-LABEL: f1:
; CHECK: stw lr, sp[0]
; CHECK: ldw lr, sp[0]
; CHECK-NEXT: retsp 0
@@ -27,7 +27,7 @@ entry:
; FP + small frame: spill FP+SR+R0+LR = entsp 3 + extsp 1
-; CHECKFP-LABEL:f3
+; CHECKFP-LABEL: f3:
; CHECKFP: entsp 3
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -43,7 +43,7 @@ entry:
; CHECKFP-NEXT: retsp 3
;
; !FP + small frame: spill R0+LR = entsp 2
-; CHECK-LABEL: f3
+; CHECK-LABEL: f3:
; CHECK: entsp 2
; CHECK-NEXT: stw [[REG:r[4-9]+]], sp[1]
; CHECK-NEXT: mov [[REG]], r0
@@ -60,7 +60,7 @@ entry:
; FP + large frame: spill FP+SR = entsp 2 + 100000
-; CHECKFP-LABEL: f4
+; CHECKFP-LABEL: f4:
; CHECKFP: entsp 65535
; CHECKFP-NEXT: .Lcfi{{[0-9]+}}
; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
@@ -81,7 +81,7 @@ entry:
; CHECKFP-NEXT: retsp 34467
;
; !FP + large frame: spill SR+SR = entsp 2 + 100000
-; CHECK-LABEL: f4
+; CHECK-LABEL: f4:
; CHECK: entsp 65535
; CHECK-NEXT: .Lcfi{{[0-9]+}}
; CHECK-NEXT: .cfi_def_cfa_offset 262140
@@ -107,7 +107,7 @@ entry:
; CHECKFP-NEXT: .LCPI[[CNST1:[0-9_]+]]:
; CHECKFP-NEXT: .long 200001
; CHECKFP-NEXT: .text
-; CHECKFP-LABEL: f6
+; CHECKFP-LABEL: f6:
; CHECKFP: entsp 65535
; CHECKFP-NEXT: .Lcfi{{[0-9]+}}
; CHECKFP-NEXT: .cfi_def_cfa_offset 262140
@@ -160,7 +160,7 @@ entry:
; CHECK-NEXT: .LCPI[[CNST1:[0-9_]+]]:
; CHECK-NEXT: .long 200002
; CHECK-NEXT: .text
-; CHECK-LABEL: f6
+; CHECK-LABEL: f6:
; CHECK: entsp 65535
; CHECK-NEXT: .Lcfi{{[0-9]+}}
; CHECK-NEXT: .cfi_def_cfa_offset 262140
@@ -207,7 +207,7 @@ entry:
}
; FP + large frame: spill FP+SR+LR = entsp 2 + 256 + extsp 1
-; CHECKFP-LABEL:f8
+; CHECKFP-LABEL: f8:
; CHECKFP: entsp 258
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -221,7 +221,7 @@ entry:
; CHECKFP-NEXT: retsp 258
;
; !FP + large frame: spill SR+SR+LR = entsp 3 + 256
-; CHECK-LABEL:f8
+; CHECK-LABEL: f8:
; CHECK: entsp 257
; CHECK-NEXT: ldaw r0, sp[254]
; CHECK-NEXT: bl f5
@@ -235,7 +235,7 @@ entry:
}
; FP + large frame: spill FP+SR+LR = entsp 2 + 32768 + extsp 1
-; CHECKFP-LABEL:f9
+; CHECKFP-LABEL: f9:
; CHECKFP: entsp 32770
; CHECKFP-NEXT: stw r10, sp[1]
; CHECKFP-NEXT: ldaw r10, sp[0]
@@ -249,7 +249,7 @@ entry:
; CHECKFP-NEXT: retsp 32770
;
; !FP + large frame: spill SR+SR+LR = entsp 3 + 32768
-; CHECK-LABEL:f9
+; CHECK-LABEL: f9:
; CHECK: entsp 32771
; CHECK-NEXT: ldaw r0, sp[32768]
; CHECK-NEXT: bl f5
diff --git a/test/DebugInfo/Generic/empty.ll b/test/DebugInfo/Generic/empty.ll
index d5f738fa0271..79912841fa6d 100644
--- a/test/DebugInfo/Generic/empty.ll
+++ b/test/DebugInfo/Generic/empty.ll
@@ -13,10 +13,9 @@
; CHECK-NOT: file_names[
; CHECK: .debug_pubnames contents:
-; CHECK-NOT: Offset
+; CHECK-NOT: {{^}}0x
-; CHECK: .debug_pubtypes contents:
-; CHECK-NOT: Offset
+; CHECK: contents:
; Don't emit DW_AT_addr_base when there are no addresses.
; FISSION-NOT: DW_AT_GNU_addr_base [DW_FORM_sec_offset]
@@ -24,8 +23,10 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !2, globals: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.1 (trunk 143523)", isOptimized: true, emissionKind: FullDebug, file: !4, enums: !2, retainedTypes: !6, globals: !2)
!2 = !{}
!3 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
!4 = !DIFile(filename: "empty.c", directory: "/home/nlewycky")
!5 = !{i32 1, !"Debug Info Version", i32 3}
+!6 = !{!7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/Generic/nodebug.ll b/test/DebugInfo/Generic/nodebug.ll
index f85b00bf9f7e..9b0eb9b4dd07 100644
--- a/test/DebugInfo/Generic/nodebug.ll
+++ b/test/DebugInfo/Generic/nodebug.ll
@@ -1,6 +1,6 @@
; REQUIRES: object-emission
-; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: %llc_dwarf < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s
; Test that a nodebug function (a function not appearing in the debug info IR
; metadata subprogram list) with DebugLocs on its IR doesn't cause crashes/does
@@ -17,9 +17,16 @@
; }
; Check that there's no DW_TAG_subprogram, not even for the 'f2' function.
+; CHECK: .debug_info contents:
; CHECK: DW_TAG_compile_unit
; CHECK-NOT: DW_TAG_subprogram
+; Expect no line table entry since there are no functions and file references in this compile unit
+; CHECK: .debug_line contents:
+; CHECK: Line table prologue:
+; CHECK: total_length: 0x00000019
+; CHECK-NOT: file_names[
+
@i = external global i32
; Function Attrs: uwtable
@@ -35,7 +42,7 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
!llvm.module.flags = !{!8, !9}
!llvm.ident = !{!10}
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !12, globals: !2, imports: !2)
!1 = !DIFile(filename: "nodebug.cpp", directory: "/tmp/dbginfo")
!2 = !{}
!4 = distinct !DISubprogram(name: "f1", linkageName: "_Z2f1v", line: 2, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 2, file: !1, scope: !5, type: !6, variables: !2)
@@ -46,3 +53,5 @@ attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="
!9 = !{i32 2, !"Debug Info Version", i32 3}
!10 = !{!"clang version 3.5.0 "}
!11 = !DILocation(line: 3, scope: !4)
+!12 = !{!13}
+!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
diff --git a/test/DebugInfo/Generic/skeletoncu.ll b/test/DebugInfo/Generic/skeletoncu.ll
index 6d91afd0fa79..b9761b2ab565 100644
--- a/test/DebugInfo/Generic/skeletoncu.ll
+++ b/test/DebugInfo/Generic/skeletoncu.ll
@@ -7,9 +7,11 @@
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
-!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind:
FullDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2, dwoId: 43981) +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "LLVM", isOptimized: false, runtimeVersion: 2, splitDebugFilename: "my.dwo", emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !2, imports: !2, dwoId: 43981) !1 = !DIFile(filename: "", directory: "/") !2 = !{} !3 = !{i32 2, !"Dwarf Version", i32 4} !4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{!6} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.cpp b/test/DebugInfo/Inputs/split-dwarf-dwp.cpp new file mode 100644 index 000000000000..b07a1537d6bf --- /dev/null +++ b/test/DebugInfo/Inputs/split-dwarf-dwp.cpp @@ -0,0 +1,12 @@ +void f1(); +__attribute__((always_inline)) void f2() { + f1(); +} +void f3() { + f2(); +} + +To produce split-dwarf-dwp.o{,dwp}, run: + + $ clang++ split-dwarf-dwp.cpp -gsplit-dwarf -c -Xclang -fdebug-compilation-dir=Output -fno-split-dwarf-inlining + $ llvm-dwp split-dwarf-dwp.dwo -o split-dwarf-dwp.o.dwp diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.o b/test/DebugInfo/Inputs/split-dwarf-dwp.o new file mode 100644 index 0000000000000000000000000000000000000000..614c62040dec63fc13b815d42ca1ea408e9cbcfa GIT binary patch literal 2744 zcmbtV&1(};5TDJqrmbyEYrmv|3yQR2W7<%x7mf8RV9_GIArgv}sa& zZw0R&Ja|;GcoanN>fhi^@TlhsIV2dBK5dB6F3^WHSOb!G8t%rJn-fb$UT z7zLQ`j`VDxW?>XYU^(~v`w!aR{nDsc%Kmh5!}dzZ>#5>K4Q@95deaAE5Rzl>7ZMYR zorU;^g|R_n=m5r?#??i^(6guJ@axuY95%KBlVAts4eLiyi3OW9@*}uPvv7<=CP1N3 zvE5ax<#>&9&9&xI=~UX9^u4CLZe`M$xpSE*YqC;lRnJJ+>=echhw)%MyamQ)(wOLv z_iW=KsUj_+;7At}$!D=k1Jn*t4p$6LAvcWO!~pW(axu~X^@0lZI)woAhCQG*+HLmb zx{W)r7)A!ReHarmy9O#O1nrkVKCZAnh9}f7pW%n zL>oyn!<&_mP&Hwo10%qM#6I|;4txZWWCt3)>5z19?QzpR+RE+O(o73TWLjY|zA1W1 zI?eAtA@N-FAr_8$=-(d-AY8s3J{#fkYwQho2CkBU(k$E!AV@9h)+j*=iHL(%4Hd=)La{(p-8`TnSX z`n6X3e~Orz6CwCS49N<3#8ltMXwucU2t?vJqxv+hf7Twr9MAXi88O{amjt+oc9~S) zk7(4@U((d4ud-U7@~OR3y;q{p6Cvq4L*Fwso@Ppn8fp<)0z4i;fu^3H&ZLW96X*Z> GkNCeP(yvbd literal 0 HcmV?d00001 diff --git a/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp b/test/DebugInfo/Inputs/split-dwarf-dwp.o.dwp new file mode 100644 index 0000000000000000000000000000000000000000..16a0af8c062f3c5c18997352081cefe736489fad GIT binary patch literal 1256 zcmbtTO>fgc5S_J?v?2m2w1-v#$r3_p510@uhZf0Ll|Vv(#0j-rtbEwT!nG~iaoSTM zCw>S&0R9F11y1~rUU*}6)n(HgPkOsEZ{E%-v(Dx7*Dov~NI{TK*O^j@K3Lar(1ros zf^&?x)8?CkM2D>Hug%p4EiWy*BP?N4A4T!5+Y{Z+E{9!Sdf-5OJ6ibMlUisgT zHi#^jvXzhQu>Z9GY}dhA+ZBV){@q@%qgQa}eR{IhR*w58e8+9jA~Yc(uI}I`R7IN9 zkLJrs8DlNn`7)}C*CMv5W&z!M4bcfDfT`sbZjyLqi2j0H3m!{3xV@(_FueF*L z1%>ZN@u;DA)J7g64|fx7gRj64jKKsfK%O5DiigXwniIknYUsS%AbT;UH5@>B3l6}0 zYt?nZok!EfX*gSiNj8tpN#>L3w2Yc|C{n5FGR)(+it5Uc^)$)iT+h=ai~h;;+20iH z>dBPc>f(&||HU~P@K{XzmUH`J6x2AC%+FwITnl))0jBpQJf_CS0$u=q79FOSpUc$P ze<2U?lj<9I_OXiiKtMh7qfVzrx>l-PWpQ7pI+|;;F cBE@69=?&gb(D-j-;)guHBMa)DO_?kG50jaCX#fBK literal 0 HcmV?d00001 diff --git a/test/DebugInfo/MIR/X86/empty-inline.mir b/test/DebugInfo/MIR/X86/empty-inline.mir new file mode 100644 index 000000000000..1766a8f44616 --- /dev/null +++ b/test/DebugInfo/MIR/X86/empty-inline.mir @@ -0,0 +1,122 @@ +# RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s +# +# This testcase has an implicit def pseudo-iunstruction with a debug location. 
+#
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_subprogram
+# CHECK: DW_AT_low_pc [DW_FORM_addr] (0x0000000000000000)
+# CHECK-NOT: DW_TAG
+# CHECK: DW_AT_specification {{.*}} "_ZN1C5m_fn3Ev"
+# CHECK-NOT: DW_TAG
+# Here should not be an inlined subroutine with 0 length.
+# CHECK: NULL
+#
+# CHECK: Address Line Column File ISA Discriminator Flags
+# CHECK-NEXT: ---
+# CHECK-NEXT: 25 0 1 0 0 is_stmt
+# CHECK-NEXT: 29 28 1 0 0 is_stmt prologue_end
+# CHECK-NEXT: 29 28 1 0 0 is_stmt end_sequence
+--- |
+  source_filename = "t.ll"
+  target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-apple-macosx"
+
+  %class.E = type { %class.D }
+  %class.D = type { %class.B }
+  %class.B = type { %class.A, %class.A }
+  %class.A = type { i8 }
+  %class.C = type <{ %class.E*, %class.B, [2 x i8] }>
+
+  @a = local_unnamed_addr global %class.E* null, align 4
+
+  define i32 @_ZN1C5m_fn3Ev(%class.C* nocapture) local_unnamed_addr align 2 !dbg !6 {
+    %2 = alloca %class.B, align 1
+    %3 = load %class.E*, %class.E** @a, align 4
+    %4 = icmp eq %class.E* %3, null
+    br i1 %4, label %10, label %5
+
+  ;