From aac4ca60bc813a35242145a1f92f325303d5df6e Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Sun, 29 Jan 2017 20:58:36 +0000
Subject: [PATCH 1/5] Vendor import of llvm release_40 branch r293443:
 https://llvm.org/svn/llvm-project/llvm/branches/release_40@293443

---
 cmake/modules/HandleLLVMOptions.cmake         |   2 +
 include/llvm/IR/IntrinsicsAMDGPU.td           |   4 +
 lib/Analysis/BasicAliasAnalysis.cpp           |  10 +-
 lib/Bitcode/Reader/MetadataLoader.cpp         |  99 ++++++---
 .../SelectionDAG/LegalizeVectorTypes.cpp      |   5 +-
 lib/Target/AArch64/AArch64.td                 |   9 +-
 lib/Target/AArch64/AArch64InstrInfo.cpp       |   2 +-
 lib/Target/AArch64/AArch64Subtarget.h         |   4 +-
 lib/Target/AMDGPU/AMDGPU.td                   |   6 +
 lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp        |   6 +-
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |  10 +-
 lib/Target/AMDGPU/AMDGPUISelLowering.cpp      |   7 +
 lib/Target/AMDGPU/AMDGPUISelLowering.h        |  11 +
 lib/Target/AMDGPU/AMDGPUInstrInfo.td          |   3 +
 lib/Target/AMDGPU/AMDGPUSubtarget.cpp         |  12 +-
 lib/Target/AMDGPU/AMDGPUSubtarget.h           |  23 +-
 lib/Target/AMDGPU/R600ISelLowering.cpp        |  30 ++-
 lib/Target/AMDGPU/R600Instructions.td         |  14 ++
 lib/Target/AMDGPU/SIFrameLowering.cpp         |  58 ++++-
 lib/Target/AMDGPU/SIISelLowering.cpp          |  18 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.cpp   |  15 +-
 lib/Target/AMDGPU/SIMachineFunctionInfo.h     |  17 ++
 lib/Target/AMDGPU/SIRegisterInfo.cpp          |  10 +-
 lib/Target/AMDGPU/VOP3Instructions.td         |   6 +-
 lib/Target/ARM/ARMAsmPrinter.cpp              |   3 +
 lib/Target/ARM/ARMISelLowering.cpp            |   4 +-
 lib/Target/X86/X86ISelLowering.cpp            | 101 +--------
 lib/Transforms/Utils/SimplifyCFG.cpp          |   8 +
 test/Analysis/BasicAA/pr31761.ll              |  19 ++
 test/CodeGen/AArch64/no-quad-ldp-stp.ll       |   2 +-
 .../AMDGPU/32-bit-local-address-space.ll      |   2 +-
 test/CodeGen/AMDGPU/add.i16.ll                |   2 +-
 test/CodeGen/AMDGPU/add.ll                    |   2 +-
 test/CodeGen/AMDGPU/amdgcn.private-memory.ll  |   4 +-
 .../amdgpu.work-item-intrinsics.deprecated.ll |   4 +-
 test/CodeGen/AMDGPU/and.ll                    |   2 +-
 test/CodeGen/AMDGPU/anyext.ll                 |   2 +-
 test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll  |   4 +-
 test/CodeGen/AMDGPU/atomic_load_add.ll        |   4 +-
 test/CodeGen/AMDGPU/atomic_load_sub.ll        |   2 +-
 test/CodeGen/AMDGPU/basic-branch.ll           |   4 +-
 test/CodeGen/AMDGPU/bfi_int.ll                |   4 +-
 test/CodeGen/AMDGPU/bfm.ll                    |   4 +-
 test/CodeGen/AMDGPU/bitcast-vector-extract.ll |   2 +-
 test/CodeGen/AMDGPU/bitreverse.ll             |   4 +-
 test/CodeGen/AMDGPU/br_cc.f16.ll              |   2 +-
 test/CodeGen/AMDGPU/branch-condition-and.ll   |   2 +-
 test/CodeGen/AMDGPU/bswap.ll                  |   2 +-
 test/CodeGen/AMDGPU/bug-vopc-commute.ll       |   6 +-
 test/CodeGen/AMDGPU/build_vector.ll           |   4 +-
 .../AMDGPU/cgp-addressing-modes-flat.ll       |   4 +-
 test/CodeGen/AMDGPU/cgp-addressing-modes.ll   |   4 +-
 test/CodeGen/AMDGPU/cgp-bitfield-extract.ll   |   4 +-
 test/CodeGen/AMDGPU/ci-use-flat-for-global.ll |  26 ---
 test/CodeGen/AMDGPU/concat_vectors.ll         |   4 +-
 .../AMDGPU/constant-fold-mi-operands.ll       |   2 +-
 test/CodeGen/AMDGPU/copy-illegal-type.ll      |   2 +-
 test/CodeGen/AMDGPU/copy-to-reg.ll            |   4 +-
 test/CodeGen/AMDGPU/ctlz.ll                   |   2 +-
 test/CodeGen/AMDGPU/ctlz_zero_undef.ll        |   4 +-
 test/CodeGen/AMDGPU/ctpop.ll                  |   4 +-
 test/CodeGen/AMDGPU/ctpop64.ll                |   2 +-
 test/CodeGen/AMDGPU/cttz_zero_undef.ll        |   4 +-
 test/CodeGen/AMDGPU/cube.ll                   |   4 +-
 test/CodeGen/AMDGPU/cvt_f32_ubyte.ll          |   2 +-
 test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll        |   4 +-
 test/CodeGen/AMDGPU/elf.ll                    |  14 +-
 test/CodeGen/AMDGPU/extload.ll                |   2 +-
 test/CodeGen/AMDGPU/extract_vector_elt-f64.ll |   2 +-
 test/CodeGen/AMDGPU/extract_vector_elt-i16.ll |   2 +-
 test/CodeGen/AMDGPU/extract_vector_elt-i64.ll |   2 +-
 test/CodeGen/AMDGPU/extract_vector_elt-i8.ll  |   2 +-
 test/CodeGen/AMDGPU/fabs.ll                   |   2 +-
 test/CodeGen/AMDGPU/fadd.f16.ll               |   2 +-
 test/CodeGen/AMDGPU/fadd.ll                   |   2 +-
 test/CodeGen/AMDGPU/fadd64.ll                 |   4 +-
 test/CodeGen/AMDGPU/fcanonicalize.f16.ll      |   6 +-
 test/CodeGen/AMDGPU/fceil.ll                  |   4 +-
 test/CodeGen/AMDGPU/fcmp.f16.ll               |   2 +-
 test/CodeGen/AMDGPU/fcmp64.ll                 |   2 +-
 test/CodeGen/AMDGPU/fcopysign.f32.ll          |   2 +-
 test/CodeGen/AMDGPU/fcopysign.f64.ll          |   2 +-
 test/CodeGen/AMDGPU/fdiv.f16.ll               |   4 +-
 test/CodeGen/AMDGPU/fdiv.f64.ll               |   2 +-
 test/CodeGen/AMDGPU/fdiv.ll                   |   6 +-
 test/CodeGen/AMDGPU/ffloor.f64.ll             |   2 +-
 test/CodeGen/AMDGPU/ffloor.ll                 |   4 +-
 test/CodeGen/AMDGPU/flat-address-space.ll     |   4 +-
 .../flat-for-global-subtarget-feature.ll      |  54 +++++
 test/CodeGen/AMDGPU/flat-scratch-reg.ll       |   2 +-
 test/CodeGen/AMDGPU/fma.f64.ll                |   4 +-
 test/CodeGen/AMDGPU/fmax3.f64.ll              |   4 +-
 test/CodeGen/AMDGPU/fmax3.ll                  |   4 +-
 test/CodeGen/AMDGPU/fmaxnum.f64.ll            |   4 +-
 test/CodeGen/AMDGPU/fmaxnum.ll                |   2 +-
 test/CodeGen/AMDGPU/fmin3.ll                  |   6 +-
 test/CodeGen/AMDGPU/fminnum.ll                |   2 +-
 .../AMDGPU/fmul-2-combine-multi-use.ll        |   2 +-
 test/CodeGen/AMDGPU/fmul.f16.ll               |   2 +-
 test/CodeGen/AMDGPU/fmul.ll                   |   2 +-
 test/CodeGen/AMDGPU/fmuladd.f64.ll            |   4 +-
 test/CodeGen/AMDGPU/fnearbyint.ll             |   4 +-
 test/CodeGen/AMDGPU/fneg-combines.ll          | 143 ++++++++----
 test/CodeGen/AMDGPU/fneg-fabs.f64.ll          |   2 +-
 test/CodeGen/AMDGPU/fneg.f64.ll               |   4 +-
 test/CodeGen/AMDGPU/fneg.ll                   |   2 +-
 test/CodeGen/AMDGPU/fp16_to_fp32.ll           |   2 +-
 test/CodeGen/AMDGPU/fp16_to_fp64.ll           |   2 +-
 test/CodeGen/AMDGPU/fp32_to_fp16.ll           |   2 +-
 test/CodeGen/AMDGPU/fp_to_sint.ll             |   2 +-
 test/CodeGen/AMDGPU/fp_to_uint.ll             |   2 +-
 test/CodeGen/AMDGPU/fpext.f16.ll              |   2 +-
 test/CodeGen/AMDGPU/fpext.ll                  |   2 +-
 test/CodeGen/AMDGPU/fptosi.f16.ll             |   2 +-
 test/CodeGen/AMDGPU/fptoui.f16.ll             |   2 +-
 test/CodeGen/AMDGPU/fptrunc.f16.ll            |   2 +-
 test/CodeGen/AMDGPU/fptrunc.ll                |   6 +-
 test/CodeGen/AMDGPU/fract.f64.ll              |   4 +-
 test/CodeGen/AMDGPU/fract.ll                  |   4 +-
 test/CodeGen/AMDGPU/frem.ll                   |   4 +-
 test/CodeGen/AMDGPU/fsqrt.f64.ll              |   2 +-
 test/CodeGen/AMDGPU/fsqrt.ll                  |   2 +-
 test/CodeGen/AMDGPU/fsub.f16.ll               |   2 +-
 test/CodeGen/AMDGPU/ftrunc.ll                 |   2 +-
 test/CodeGen/AMDGPU/gep-address-space.ll      |   2 +-
 test/CodeGen/AMDGPU/global-directive.ll       |   2 +-
 test/CodeGen/AMDGPU/global-extload-i16.ll     |   4 +-
 test/CodeGen/AMDGPU/global_atomics.ll         |   2 +-
 test/CodeGen/AMDGPU/global_atomics_i64.ll     |   2 +-
 test/CodeGen/AMDGPU/gv-const-addrspace.ll     |   4 +-
 test/CodeGen/AMDGPU/gv-offset-folding.ll      |   2 +-
 test/CodeGen/AMDGPU/half.ll                   |   2 +-
 test/CodeGen/AMDGPU/hsa-note-no-func.ll       |   6 +-
 test/CodeGen/AMDGPU/i1-copy-phi.ll            |   2 +-
 test/CodeGen/AMDGPU/icmp64.ll                 |   4 +-
 test/CodeGen/AMDGPU/imm.ll                    |   2 +-
 test/CodeGen/AMDGPU/imm16.ll                  |   2 +-
 test/CodeGen/AMDGPU/indirect-addressing-si.ll |   4 +-
 test/CodeGen/AMDGPU/indirect-private-64.ll    |   4 +-
 test/CodeGen/AMDGPU/infinite-loop.ll          |   4 +-
 test/CodeGen/AMDGPU/inline-asm.ll             |   2 +-
 test/CodeGen/AMDGPU/insert_vector_elt.ll      |   2 +-
 test/CodeGen/AMDGPU/inserted-wait-states.mir  |   2 +-
 test/CodeGen/AMDGPU/kernel-args.ll            |   2 +-
 test/CodeGen/AMDGPU/large-alloca-graphics.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll    |   4 +-
 test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll    |   6 +-
 test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll      |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll    |   2 +-
 .../AMDGPU/llvm.amdgcn.div.fixup.f16.ll       |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll  |   4 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll   |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll  |   6 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll      |   2 +-
 .../AMDGPU/llvm.amdgcn.frexp.exp.f16.ll       |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll  |   2 +-
 .../AMDGPU/llvm.amdgcn.frexp.mant.f16.ll      |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll      |  12 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll    |   4 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll    |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll   |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll    |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll        |   2 +-
 .../AMDGPU/llvm.amdgcn.s.memrealtime.ll       |   4 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll  |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll      |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll    |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll        |   2 +-
 test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll |   4 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll |   6 +-
 test/CodeGen/AMDGPU/llvm.ceil.f16.ll          |   2 +-
 test/CodeGen/AMDGPU/llvm.cos.f16.ll           |   2 +-
 test/CodeGen/AMDGPU/llvm.exp2.f16.ll          |   2 +-
 test/CodeGen/AMDGPU/llvm.floor.f16.ll         |   2 +-
 test/CodeGen/AMDGPU/llvm.fma.f16.ll           |   2 +-
 test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll       |   4 +-
 test/CodeGen/AMDGPU/llvm.log2.f16.ll          |   2 +-
 test/CodeGen/AMDGPU/llvm.maxnum.f16.ll        |   2 +-
 test/CodeGen/AMDGPU/llvm.memcpy.ll            |   2 +-
 test/CodeGen/AMDGPU/llvm.minnum.f16.ll        |   2 +-
 .../AMDGPU/llvm.r600.read.local.size.ll       |   4 +-
 test/CodeGen/AMDGPU/llvm.rint.f16.ll          |   2 +-
 test/CodeGen/AMDGPU/llvm.round.ll             |   2 +-
 test/CodeGen/AMDGPU/llvm.sin.f16.ll           |   2 +-
 test/CodeGen/AMDGPU/llvm.sqrt.f16.ll          |   2 +-
 test/CodeGen/AMDGPU/llvm.trunc.f16.ll         |   2 +-
 test/CodeGen/AMDGPU/load-constant-f64.ll      |   2 +-
 test/CodeGen/AMDGPU/load-constant-i1.ll       |   2 +-
 test/CodeGen/AMDGPU/load-constant-i16.ll      |   2 +-
 test/CodeGen/AMDGPU/load-constant-i32.ll      |   2 +-
 test/CodeGen/AMDGPU/load-constant-i64.ll      |   2 +-
 test/CodeGen/AMDGPU/load-constant-i8.ll       |   2 +-
 test/CodeGen/AMDGPU/load-global-f32.ll        |   2 +-
 test/CodeGen/AMDGPU/load-global-f64.ll        |   2 +-
 test/CodeGen/AMDGPU/load-global-i1.ll         |   2 +-
 test/CodeGen/AMDGPU/load-global-i16.ll        |   2 +-
 test/CodeGen/AMDGPU/load-global-i32.ll        |   2 +-
 test/CodeGen/AMDGPU/load-global-i64.ll        |   2 +-
 test/CodeGen/AMDGPU/load-global-i8.ll         |   2 +-
 test/CodeGen/AMDGPU/load-local-i32.ll         |   2 +-
 test/CodeGen/AMDGPU/load-local-i8.ll          |  32 +--
 test/CodeGen/AMDGPU/load-weird-sizes.ll       |   2 +-
 test/CodeGen/AMDGPU/local-64.ll               |   4 +-
 test/CodeGen/AMDGPU/local-atomics.ll          |   2 +-
 test/CodeGen/AMDGPU/local-atomics64.ll        |   2 +-
 test/CodeGen/AMDGPU/local-stack-slot-bug.ll   |   2 +-
 .../CodeGen/AMDGPU/local-stack-slot-offset.ll |   2 +-
 test/CodeGen/AMDGPU/lshl.ll                   |   2 +-
 test/CodeGen/AMDGPU/lshr.ll                   |   2 +-
 test/CodeGen/AMDGPU/mad_int24.ll              |   2 +-
 test/CodeGen/AMDGPU/mad_uint24.ll             |   6 +-
 test/CodeGen/AMDGPU/madak.ll                  |   2 +-
 test/CodeGen/AMDGPU/madmk.ll                  |   2 +-
 test/CodeGen/AMDGPU/max.i16.ll                |   2 +-
 test/CodeGen/AMDGPU/merge-store-usedef.ll     |   2 +-
 test/CodeGen/AMDGPU/min.ll                    |   2 +-
 test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll      |   3 +-
 test/CodeGen/AMDGPU/mul.ll                    |   2 +-
 test/CodeGen/AMDGPU/mul_int24.ll              |   2 +-
 test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll      |   2 +-
 test/CodeGen/AMDGPU/operand-spacing.ll        |   4 +-
 test/CodeGen/AMDGPU/or.ll                     |   2 +-
 test/CodeGen/AMDGPU/rcp-pattern.ll            |   2 +-
 test/CodeGen/AMDGPU/readcyclecounter.ll       |   4 +-
 .../AMDGPU/reduce-load-width-alignment.ll     |   2 +-
 .../AMDGPU/reg-coalescer-sched-crash.ll       |   2 +-
 test/CodeGen/AMDGPU/reorder-stores.ll         |   4 +-
 test/CodeGen/AMDGPU/rotl.i64.ll               |   2 +-
 test/CodeGen/AMDGPU/rotr.i64.ll               |   2 +-
 test/CodeGen/AMDGPU/rotr.ll                   |   2 +-
 test/CodeGen/AMDGPU/s_addk_i32.ll             |   2 +-
 test/CodeGen/AMDGPU/s_movk_i32.ll             |   2 +-
 test/CodeGen/AMDGPU/s_mulk_i32.ll             |   4 +-
 test/CodeGen/AMDGPU/scalar_to_vector.ll       |   2 +-
 .../AMDGPU/schedule-kernel-arg-loads.ll       |   2 +-
 test/CodeGen/AMDGPU/sdiv.ll                   |   2 +-
 test/CodeGen/AMDGPU/sdivrem24.ll              |   2 +-
 test/CodeGen/AMDGPU/sdivrem64.ll              |   2 +-
 .../AMDGPU/select-fabs-fneg-extract.ll        |   4 +-
 test/CodeGen/AMDGPU/select-i1.ll              |   2 +-
 test/CodeGen/AMDGPU/select-vectors.ll         |   2 +-
 test/CodeGen/AMDGPU/select.f16.ll             |   2 +-
 test/CodeGen/AMDGPU/selectcc-opt.ll           |   4 +-
 test/CodeGen/AMDGPU/setcc-opt.ll              |   2 +-
 test/CodeGen/AMDGPU/setcc64.ll                |   2 +-
 test/CodeGen/AMDGPU/seto.ll                   |   2 +-
 test/CodeGen/AMDGPU/sext-in-reg.ll            |   2 +-
 .../AMDGPU/sgpr-copy-duplicate-operand.ll     |   2 +-
 test/CodeGen/AMDGPU/sgpr-copy.ll              |   2 +-
 test/CodeGen/AMDGPU/shl.ll                    |   2 +-
 test/CodeGen/AMDGPU/shl_add_ptr.ll            |   2 +-
 test/CodeGen/AMDGPU/si-annotate-cf.ll         |   2 +-
 test/CodeGen/AMDGPU/si-lod-bias.ll            |   2 +-
 test/CodeGen/AMDGPU/si-sgpr-spill.ll          |   2 +-
 test/CodeGen/AMDGPU/si-spill-cf.ll            |   2 +-
 test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll    |   4 +-
 test/CodeGen/AMDGPU/si-vector-hang.ll         |   2 +-
 test/CodeGen/AMDGPU/sign_extend.ll            |   2 +-
 test/CodeGen/AMDGPU/sint_to_fp.i64.ll         |   2 +-
 test/CodeGen/AMDGPU/sint_to_fp.ll             |   4 +-
 test/CodeGen/AMDGPU/sitofp.f16.ll             |   2 +-
 test/CodeGen/AMDGPU/smed3.ll                  |   2 +-
 test/CodeGen/AMDGPU/sminmax.ll                |   4 +-
 test/CodeGen/AMDGPU/smrd-vccz-bug.ll          |   2 +-
 test/CodeGen/AMDGPU/smrd.ll                   |   2 +-
 test/CodeGen/AMDGPU/sopk-compares.ll          |   2 +-
 test/CodeGen/AMDGPU/sra.ll                    |   2 +-
 test/CodeGen/AMDGPU/srl.ll                    |   2 +-
 test/CodeGen/AMDGPU/store-global.ll           |   2 +-
 test/CodeGen/AMDGPU/store-v3i64.ll            |   2 +-
 test/CodeGen/AMDGPU/sub.i16.ll                |   2 +-
 test/CodeGen/AMDGPU/trunc-bitcast-vector.ll   |   2 +-
 test/CodeGen/AMDGPU/trunc-cmp-constant.ll     |   4 +-
 test/CodeGen/AMDGPU/trunc-store-i1.ll         |   4 +-
 test/CodeGen/AMDGPU/trunc-store.ll            |   2 +-
 test/CodeGen/AMDGPU/uaddo.ll                  |   2 +-
 test/CodeGen/AMDGPU/udiv.ll                   |   2 +-
 test/CodeGen/AMDGPU/udivrem.ll                |   2 +-
 test/CodeGen/AMDGPU/udivrem24.ll              |   2 +-
 test/CodeGen/AMDGPU/udivrem64.ll              |   2 +-
 test/CodeGen/AMDGPU/uint_to_fp.i64.ll         |   4 +-
 test/CodeGen/AMDGPU/uint_to_fp.ll             |   4 +-
 test/CodeGen/AMDGPU/uitofp.f16.ll             |   2 +-
 test/CodeGen/AMDGPU/umed3.ll                  |   2 +-
 test/CodeGen/AMDGPU/unaligned-load-store.ll   |   2 +-
 test/CodeGen/AMDGPU/uniform-cfg.ll            |   2 +-
 test/CodeGen/AMDGPU/urecip.ll                 |   6 +-
 test/CodeGen/AMDGPU/urem.ll                   |   2 +-
 .../CodeGen/AMDGPU/use-sgpr-multiple-times.ll |   2 +-
 test/CodeGen/AMDGPU/v_cndmask.ll              |   2 +-
 test/CodeGen/AMDGPU/v_mac.ll                  |   2 +-
 test/CodeGen/AMDGPU/v_mac_f16.ll              |  16 +-
 test/CodeGen/AMDGPU/v_madak_f16.ll            |   2 +-
 ...vgpr-spill-emergency-stack-slot-compute.ll |   4 +-
 test/CodeGen/AMDGPU/vselect.ll                |   2 +-
 test/CodeGen/AMDGPU/wait.ll                   |   8 +-
 test/CodeGen/AMDGPU/waitcnt-flat.ll           |   2 +-
 test/CodeGen/AMDGPU/xor.ll                    |   2 +-
 test/CodeGen/AMDGPU/zero_extend.ll            |   2 +-
 test/CodeGen/ARM/neon_div.ll                  |  83 +++----
 test/CodeGen/ARM/vector-load.ll               |  10 +
 .../xray-armv6-attribute-instrumentation.ll   |   6 +
 .../xray-armv7-attribute-instrumentation.ll   |   6 +
 test/CodeGen/X86/avx-trunc.ll                 |  26 ---
 test/CodeGen/X86/avx512-trunc.ll              | 205 ------------------
 test/CodeGen/X86/phaddsub.ll                  |  58 +++++
 test/MC/AMDGPU/vop3.s                         |   8 +-
 .../SimplifyCFG/sink-common-code.ll           |  24 ++
 utils/sanitizers/ubsan_blacklist.txt          |   7 +
 316 files changed, 1032 insertions(+), 929 deletions(-)
 create mode 100644 test/Analysis/BasicAA/pr31761.ll
 delete mode 100644 test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
 create mode 100644 test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
 create mode 100644 utils/sanitizers/ubsan_blacklist.txt

diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index 4ce7f57403c4..1825b55ed54b 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -555,6 +555,8 @@ if(LLVM_USE_SANITIZER)
       append_common_sanitizer_flags()
       append("-fsanitize=undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all"
               CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+      append("-fsanitize-blacklist=${CMAKE_SOURCE_DIR}/utils/sanitizers/ubsan_blacklist.txt"
+	      CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
     elseif (LLVM_USE_SANITIZER STREQUAL "Thread")
       append_common_sanitizer_flags()
       append("-fsanitize=thread" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 07d5b5ea40dc..dcec3860f2ca 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -100,6 +100,10 @@ def int_amdgcn_dispatch_id :
   GCCBuiltin<"__builtin_amdgcn_dispatch_id">,
   Intrinsic<[llvm_i64_ty], [], [IntrNoMem]>;
 
+def int_amdgcn_implicit_buffer_ptr :
+  GCCBuiltin<"__builtin_amdgcn_implicit_buffer_ptr">,
+  Intrinsic<[LLVMQualPointerType<llvm_i8_ty, 2>], [], [IntrNoMem]>;
+
 //===----------------------------------------------------------------------===//
 // Instruction Intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index e8d86bdbca5b..c8d057949493 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1191,14 +1191,14 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size,
       return MayAlias;
 
     AliasResult R = aliasCheck(UnderlyingV1, MemoryLocation::UnknownSize,
-                               AAMDNodes(), V2, V2Size, V2AAInfo,
-                               nullptr, UnderlyingV2);
+                               AAMDNodes(), V2, MemoryLocation::UnknownSize,
+                               V2AAInfo, nullptr, UnderlyingV2);
     if (R != MustAlias)
       // If V2 may alias GEP base pointer, conservatively returns MayAlias.
       // If V2 is known not to alias GEP base pointer, then the two values
-      // cannot alias per GEP semantics: "A pointer value formed from a
-      // getelementptr instruction is associated with the addresses associated
-      // with the first operand of the getelementptr".
+      // cannot alias per GEP semantics: "Any memory access must be done through
+      // a pointer value associated with an address range of the memory access,
+      // otherwise the behavior is undefined.".
       return R;
 
     // If the max search depth is reached the result is undefined
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index a44b92effc7e..145b9c50367a 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -919,7 +919,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     // If this isn't a LocalAsMetadata record, we're dropping it.  This used
     // to be legal, but there's no upgrade path.
     auto dropRecord = [&] {
-      MetadataList.assignValue(MDNode::get(Context, None), NextMetadataNo++);
+      MetadataList.assignValue(MDNode::get(Context, None), NextMetadataNo);
+      NextMetadataNo++;
     };
     if (Record.size() != 2) {
       dropRecord();
@@ -934,7 +935,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
 
     MetadataList.assignValue(
         LocalAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_OLD_NODE: {
@@ -959,7 +961,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       } else
         Elts.push_back(nullptr);
     }
-    MetadataList.assignValue(MDNode::get(Context, Elts), NextMetadataNo++);
+    MetadataList.assignValue(MDNode::get(Context, Elts), NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_VALUE: {
@@ -972,7 +975,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
 
     MetadataList.assignValue(
         ValueAsMetadata::get(ValueList.getValueFwdRef(Record[1], Ty)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_DISTINCT_NODE:
@@ -985,7 +989,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       Elts.push_back(getMDOrNull(ID));
     MetadataList.assignValue(IsDistinct ? MDNode::getDistinct(Context, Elts)
                                         : MDNode::get(Context, Elts),
-                             NextMetadataNo++);
+                             NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_LOCATION: {
@@ -999,7 +1004,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     Metadata *InlinedAt = getMDOrNull(Record[4]);
     MetadataList.assignValue(
         GET_OR_DISTINCT(DILocation, (Context, Line, Column, Scope, InlinedAt)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_GENERIC_DEBUG: {
@@ -1019,7 +1025,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
       DwarfOps.push_back(getMDOrNull(Record[I]));
     MetadataList.assignValue(
         GET_OR_DISTINCT(GenericDINode, (Context, Tag, Header, DwarfOps)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_SUBRANGE: {
@@ -1030,7 +1037,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     MetadataList.assignValue(
         GET_OR_DISTINCT(DISubrange,
                         (Context, Record[1], unrotateSign(Record[2]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_ENUMERATOR: {
@@ -1041,7 +1049,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     MetadataList.assignValue(
         GET_OR_DISTINCT(DIEnumerator, (Context, unrotateSign(Record[1]),
                                        getMDString(Record[2]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_BASIC_TYPE: {
@@ -1053,7 +1062,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         GET_OR_DISTINCT(DIBasicType,
                         (Context, Record[1], getMDString(Record[2]), Record[3],
                          Record[4], Record[5])),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_DERIVED_TYPE: {
@@ -1069,7 +1079,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                          getDITypeRefOrNull(Record[5]),
                          getDITypeRefOrNull(Record[6]), Record[7], Record[8],
                          Record[9], Flags, getDITypeRefOrNull(Record[11]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_COMPOSITE_TYPE: {
@@ -1134,7 +1145,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     if (!IsNotUsedInTypeRef && Identifier)
       MetadataList.addTypeRef(*Identifier, *cast<DICompositeType>(CT));
 
-    MetadataList.assignValue(CT, NextMetadataNo++);
+    MetadataList.assignValue(CT, NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_SUBROUTINE_TYPE: {
@@ -1151,7 +1163,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
 
     MetadataList.assignValue(
         GET_OR_DISTINCT(DISubroutineType, (Context, Flags, CC, Types)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
 
@@ -1165,7 +1178,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                         (Context, getMDOrNull(Record[1]),
                          getMDString(Record[2]), getMDString(Record[3]),
                          getMDString(Record[4]), getMDString(Record[5]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
 
@@ -1181,7 +1195,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
              Record.size() == 3 ? DIFile::CSK_None
                                 : static_cast<DIFile::ChecksumKind>(Record[3]),
              Record.size() == 3 ? nullptr : getMDString(Record[4]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_COMPILE_UNIT: {
@@ -1200,7 +1215,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         Record.size() <= 14 ? 0 : Record[14],
         Record.size() <= 16 ? true : Record[16]);
 
-    MetadataList.assignValue(CU, NextMetadataNo++);
+    MetadataList.assignValue(CU, NextMetadataNo);
+    NextMetadataNo++;
 
     // Move the Upgrade the list of subprograms.
     if (Metadata *SPs = getMDOrNullWithoutPlaceholders(Record[11]))
@@ -1247,7 +1263,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                        getMDOrNull(Record[16 + Offset]), // declaration
                        getMDOrNull(Record[17 + Offset])  // variables
                        ));
-    MetadataList.assignValue(SP, NextMetadataNo++);
+    MetadataList.assignValue(SP, NextMetadataNo);
+    NextMetadataNo++;
 
     // Upgrade sp->function mapping to function->sp mapping.
     if (HasFn) {
@@ -1272,7 +1289,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         GET_OR_DISTINCT(DILexicalBlock,
                         (Context, getMDOrNull(Record[1]),
                          getMDOrNull(Record[2]), Record[3], Record[4])),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_LEXICAL_BLOCK_FILE: {
@@ -1284,7 +1302,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         GET_OR_DISTINCT(DILexicalBlockFile,
                         (Context, getMDOrNull(Record[1]),
                          getMDOrNull(Record[2]), Record[3])),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_NAMESPACE: {
@@ -1298,7 +1317,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                         (Context, getMDOrNull(Record[1]),
                          getMDOrNull(Record[2]), getMDString(Record[3]),
                          Record[4], ExportSymbols)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_MACRO: {
@@ -1310,7 +1330,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         GET_OR_DISTINCT(DIMacro,
                         (Context, Record[1], Record[2], getMDString(Record[3]),
                          getMDString(Record[4]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_MACRO_FILE: {
@@ -1322,7 +1343,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         GET_OR_DISTINCT(DIMacroFile,
                         (Context, Record[1], Record[2], getMDOrNull(Record[3]),
                          getMDOrNull(Record[4]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_TEMPLATE_TYPE: {
@@ -1333,7 +1355,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter,
                                              (Context, getMDString(Record[1]),
                                               getDITypeRefOrNull(Record[2]))),
-                             NextMetadataNo++);
+                             NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_TEMPLATE_VALUE: {
@@ -1346,7 +1369,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                         (Context, Record[1], getMDString(Record[2]),
                          getDITypeRefOrNull(Record[3]),
                          getMDOrNull(Record[4]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_GLOBAL_VAR: {
@@ -1364,7 +1388,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                            getMDOrNull(Record[4]), Record[5],
                            getDITypeRefOrNull(Record[6]), Record[7], Record[8],
                            getMDOrNull(Record[10]), Record[11])),
-          NextMetadataNo++);
+          NextMetadataNo);
+      NextMetadataNo++;
     } else if (Version == 0) {
       // Upgrade old metadata, which stored a global variable reference or a
       // ConstantInt here.
@@ -1396,7 +1421,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
            getMDOrNull(Record[10]), AlignInBits));
 
       auto *DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
-      MetadataList.assignValue(DGVE, NextMetadataNo++);
+      MetadataList.assignValue(DGVE, NextMetadataNo);
+      NextMetadataNo++;
       if (Attach)
         Attach->addDebugInfo(DGVE);
     } else
@@ -1429,7 +1455,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                          getMDOrNull(Record[3 + HasTag]), Record[4 + HasTag],
                          getDITypeRefOrNull(Record[5 + HasTag]),
                          Record[6 + HasTag], Flags, AlignInBits)),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_EXPRESSION: {
@@ -1446,7 +1473,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
 
     MetadataList.assignValue(
         GET_OR_DISTINCT(DIExpression, (Context, makeArrayRef(Record).slice(1))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_GLOBAL_VAR_EXPR: {
@@ -1457,7 +1485,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     MetadataList.assignValue(GET_OR_DISTINCT(DIGlobalVariableExpression,
                                              (Context, getMDOrNull(Record[1]),
                                               getMDOrNull(Record[2]))),
-                             NextMetadataNo++);
+                             NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_OBJC_PROPERTY: {
@@ -1471,7 +1500,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                          getMDOrNull(Record[2]), Record[3],
                          getMDString(Record[4]), getMDString(Record[5]),
                          Record[6], getDITypeRefOrNull(Record[7]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_IMPORTED_ENTITY: {
@@ -1484,7 +1514,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                         (Context, Record[1], getMDOrNull(Record[2]),
                          getDITypeRefOrNull(Record[3]), Record[4],
                          getMDString(Record[5]))),
-        NextMetadataNo++);
+        NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_STRING_OLD: {
@@ -1494,13 +1525,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     HasSeenOldLoopTags |= mayBeOldLoopAttachmentTag(String);
     ++NumMDStringLoaded;
     Metadata *MD = MDString::get(Context, String);
-    MetadataList.assignValue(MD, NextMetadataNo++);
+    MetadataList.assignValue(MD, NextMetadataNo);
+    NextMetadataNo++;
     break;
   }
   case bitc::METADATA_STRINGS: {
     auto CreateNextMDString = [&](StringRef Str) {
       ++NumMDStringLoaded;
-      MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo++);
+      MetadataList.assignValue(MDString::get(Context, Str), NextMetadataNo);
+      NextMetadataNo++;
     };
     if (Error Err = parseMetadataStrings(Record, Blob, CreateNextMDString))
       return Err;
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 27a9ac337f25..6906f67ebacb 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -3439,7 +3439,10 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
                       LD->getPointerInfo().getWithOffset(Offset),
                       MinAlign(Align, Increment), MMOFlags, AAInfo);
       LdChain.push_back(L.getValue(1));
-      if (L->getValueType(0).isVector()) {
+      if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
+        // Later code assumes the vector loads produced will be mergeable, so we
+        // must pad the final entry up to the previous width. Scalars are
+        // combined separately.
         SmallVector<SDValue, 16> Loads;
         Loads.push_back(L);
         unsigned size = L->getValueSizeInBits(0);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 740766b151bb..91c335fac32d 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -85,9 +85,8 @@ def FeaturePostRAScheduler : SubtargetFeature<"use-postra-scheduler",
 def FeatureSlowMisaligned128Store : SubtargetFeature<"slow-misaligned-128store",
     "Misaligned128StoreIsSlow", "true", "Misaligned 128 bit stores are slow">;
 
-def FeatureAvoidQuadLdStPairs : SubtargetFeature<"no-quad-ldst-pairs",
-    "AvoidQuadLdStPairs", "true",
-    "Do not form quad load/store pair operations">;
+def FeatureSlowPaired128 : SubtargetFeature<"slow-paired-128",
+    "Paired128IsSlow", "true", "Paired 128 bit loads and stores are slow">;
 
 def FeatureAlternateSExtLoadCVTF32Pattern : SubtargetFeature<
     "alternate-sextload-cvt-f32-pattern", "UseAlternateSExtLoadCVTF32Pattern",
@@ -222,7 +221,7 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
 
 def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M1 processors",
-                                    [FeatureAvoidQuadLdStPairs,
+                                    [FeatureSlowPaired128,
                                      FeatureCRC,
                                      FeatureCrypto,
                                      FeatureCustomCheapAsMoveHandling,
@@ -236,7 +235,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
 
 def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
                                     "Samsung Exynos-M2/M3 processors",
-                                    [FeatureAvoidQuadLdStPairs,
+                                    [FeatureSlowPaired128,
                                      FeatureCRC,
                                      FeatureCrypto,
                                      FeatureCustomCheapAsMoveHandling,
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5c8acba26aab..4c789926e3e4 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -1652,7 +1652,7 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
     return false;
 
   // On some CPUs quad load/store pairs are slower than two single load/stores.
-  if (Subtarget.avoidQuadLdStPairs()) {
+  if (Subtarget.isPaired128Slow()) {
     switch (MI.getOpcode()) {
     default:
       break;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 73f63b8b9f67..a99340225082 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -79,7 +79,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   bool CustomAsCheapAsMove = false;
   bool UsePostRAScheduler = false;
   bool Misaligned128StoreIsSlow = false;
-  bool AvoidQuadLdStPairs = false;
+  bool Paired128IsSlow = false;
   bool UseAlternateSExtLoadCVTF32Pattern = false;
   bool HasArithmeticBccFusion = false;
   bool HasArithmeticCbzFusion = false;
@@ -189,7 +189,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   }
   bool hasCustomCheapAsMoveHandling() const { return CustomAsCheapAsMove; }
   bool isMisaligned128StoreSlow() const { return Misaligned128StoreIsSlow; }
-  bool avoidQuadLdStPairs() const { return AvoidQuadLdStPairs; }
+  bool isPaired128Slow() const { return Paired128IsSlow; }
   bool useAlternateSExtLoadCVTF32Pattern() const {
     return UseAlternateSExtLoadCVTF32Pattern;
   }
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 0b2badff7ccf..13022009af16 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -282,6 +282,12 @@ def FeatureEnableSIScheduler : SubtargetFeature<"si-scheduler",
   "Enable SI Machine Scheduler"
 >;
 
+// Unless +-flat-for-global is specified, turn on FlatForGlobal for
+// all OS-es on VI and newer hardware to avoid assertion failures due
+// to missing ADDR64 variants of MUBUF instructions.
+// FIXME: moveToVALU should be able to handle converting addr64 MUBUF
+// instructions.
+
 def FeatureFlatForGlobal : SubtargetFeature<"flat-for-global",
   "FlatForGlobal",
   "true",
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 4acd55eb6120..974e79fff3d7 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -140,7 +140,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
-  if (STM.isAmdCodeObjectV2()) {
+  if (STM.isAmdCodeObjectV2(*MF)) {
     getSIProgramInfo(KernelInfo, *MF);
     EmitAmdKernelCodeT(*MF, KernelInfo);
   }
@@ -149,7 +149,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
-  if (MFI->isKernel() && STM.isAmdCodeObjectV2()) {
+  if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
     AMDGPUTargetStreamer *TS =
         static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
     SmallString<128> SymbolName;
@@ -779,7 +779,7 @@ void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
 
   // FIXME: Should use getKernArgSize
   header.kernarg_segment_byte_size =
-      STM.getKernArgSegmentSize(MFI->getABIArgOffset());
+    STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
   header.wavefront_sgpr_count = KernelInfo.NumSGPR;
   header.workitem_vgpr_count = KernelInfo.NumVGPR;
   header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2b4fc5397b18..5bf347e48650 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -727,14 +727,8 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
   unsigned Opc
     = (VT == MVT::f64) ? AMDGPU::V_DIV_SCALE_F64 : AMDGPU::V_DIV_SCALE_F32;
 
-  // src0_modifiers, src0, src1_modifiers, src1, src2_modifiers, src2, clamp,
-  // omod
-  SDValue Ops[8];
-
-  SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
-  SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
-  SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
-  CurDAG->SelectNodeTo(N, Opc, VT, MVT::i1, Ops);
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+  CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
 }
 
 bool AMDGPUDAGToDAGISel::isDSOffsetLegal(const SDValue &Base, unsigned Offset,
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e48c1943cb01..54caa2c5dfad 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -2855,6 +2855,9 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   SDLoc SL(N);
   switch (Opc) {
   case ISD::FADD: {
+    if (!mayIgnoreSignedZero(N0))
+      return SDValue();
+
     // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
     SDValue LHS = N0.getOperand(0);
     SDValue RHS = N0.getOperand(1);
@@ -2895,6 +2898,9 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::FMA:
   case ISD::FMAD: {
+    if (!mayIgnoreSignedZero(N0))
+      return SDValue();
+
     // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
     SDValue LHS = N0.getOperand(0);
     SDValue MHS = N0.getOperand(1);
@@ -3272,6 +3278,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
   NODE_NAME_CASE(KILL)
+  NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(SENDMSG)
   NODE_NAME_CASE(SENDMSGHALT)
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 69567aa5f713..f6adceac6f11 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -119,6 +119,16 @@ class AMDGPUTargetLowering : public TargetLowering {
 public:
   AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI);
 
+  bool mayIgnoreSignedZero(SDValue Op) const {
+    if (getTargetMachine().Options.UnsafeFPMath) // FIXME: nsz only
+      return true;
+
+    if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op))
+      return BO->Flags.hasNoSignedZeros();
+
+    return false;
+  }
+
   bool isFAbsFree(EVT VT) const override;
   bool isFNegFree(EVT VT) const override;
   bool isTruncateFree(EVT Src, EVT Dest) const override;
@@ -320,6 +330,7 @@ enum NodeType : unsigned {
   INTERP_P2,
   PC_ADD_REL_OFFSET,
   KILL,
+  DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   STORE_MSKOR,
   LOAD_CONSTANT,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index f079c8d0c70c..d7fa28bdc001 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -54,6 +54,9 @@ def AMDGPUconstdata_ptr : SDNode<
 // This argument to this node is a dword address.
 def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
 
+// Force dependencies for vector trunc stores
+def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
+
 def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
 def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
 
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 74851aedbb21..c35a67de1d7f 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -48,6 +48,13 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
 
   ParseSubtargetFeatures(GPU, FullFS);
 
+  // Unless +-flat-for-global is specified, turn on FlatForGlobal for all OS-es
+  // on VI and newer hardware to avoid assertion failures due to missing ADDR64
+  // variants of MUBUF instructions.
+  if (!hasAddr64() && !FS.contains("flat-for-global")) {
+    FlatForGlobal = true;
+  }
+
   // FIXME: I don't think think Evergreen has any useful support for
   // denormals, but should be checked. Should we issue a warning somewhere
   // if someone tries to enable these?
@@ -297,8 +304,9 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
   return EnableVGPRSpilling || !AMDGPU::isShader(F.getCallingConv());
 }
 
-unsigned SISubtarget::getKernArgSegmentSize(unsigned ExplicitArgBytes) const {
-  unsigned ImplicitBytes = getImplicitArgNumBytes();
+unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
+					    unsigned ExplicitArgBytes) const {
+  unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
   if (ImplicitBytes == 0)
     return ExplicitArgBytes;
 
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 51ba501bddd1..0e3cb7dc1f87 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -311,22 +311,31 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return EnableXNACK;
   }
 
-  bool isAmdCodeObjectV2() const {
-    return isAmdHsaOS() || isMesa3DOS();
+  bool isMesaKernel(const MachineFunction &MF) const {
+    return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv());
+  }
+
+  // Covers VS/PS/CS graphics shaders
+  bool isMesaGfxShader(const MachineFunction &MF) const {
+    return isMesa3DOS() && AMDGPU::isShader(MF.getFunction()->getCallingConv());
+  }
+
+  bool isAmdCodeObjectV2(const MachineFunction &MF) const {
+    return isAmdHsaOS() || isMesaKernel(MF);
   }
 
   /// \brief Returns the offset in bytes from the start of the input buffer
   ///        of the first explicit kernel argument.
-  unsigned getExplicitKernelArgOffset() const {
-    return isAmdCodeObjectV2() ? 0 : 36;
+  unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
+    return isAmdCodeObjectV2(MF) ? 0 : 36;
   }
 
   unsigned getAlignmentForImplicitArgPtr() const {
     return isAmdHsaOS() ? 8 : 4;
   }
 
-  unsigned getImplicitArgNumBytes() const {
-    if (isMesa3DOS())
+  unsigned getImplicitArgNumBytes(const MachineFunction &MF) const {
+    if (isMesaKernel(MF))
       return 16;
     if (isAmdHsaOS() && isOpenCLEnv())
       return 32;
@@ -585,7 +594,7 @@ class SISubtarget final : public AMDGPUSubtarget {
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
-  unsigned getKernArgSegmentSize(unsigned ExplictArgBytes) const;
+  unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
 
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index de7ce5cb9e47..77fee4356b65 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1115,7 +1115,10 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
     llvm_unreachable("Unsupported private trunc store");
   }
 
-  SDValue Chain = Store->getChain();
+  SDValue OldChain = Store->getChain();
+  bool VectorTrunc = (OldChain.getOpcode() == AMDGPUISD::DUMMY_CHAIN);
+  // Skip dummy
+  SDValue Chain = VectorTrunc ? OldChain->getOperand(0) : OldChain;
   SDValue BasePtr = Store->getBasePtr();
   SDValue Offset = Store->getOffset();
   EVT MemVT = Store->getMemoryVT();
@@ -1171,7 +1174,15 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
 
   // Store dword
   // TODO: Can we be smarter about MachinePointerInfo?
-  return DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
+  SDValue NewStore = DAG.getStore(Chain, DL, Value, Ptr, MachinePointerInfo());
+
+  // If we are part of expanded vector, make our neighbors depend on this store
+  if (VectorTrunc) {
+    // Make all other vector elements depend on this store
+    Chain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, NewStore);
+    DAG.ReplaceAllUsesOfValueWith(OldChain, Chain);
+  }
+  return NewStore;
 }
 
 SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1191,6 +1202,17 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   // Neither LOCAL nor PRIVATE can do vectors at the moment
   if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
       VT.isVector()) {
+    if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) {
+      // Add an extra level of chain to isolate this vector
+      SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
+      // TODO: can the chain be replaced without creating a new store?
+      SDValue NewStore = DAG.getTruncStore(
+          NewChain, DL, Value, Ptr, StoreNode->getPointerInfo(),
+          MemVT, StoreNode->getAlignment(),
+          StoreNode->getMemOperand()->getFlags(), StoreNode->getAAInfo());
+      StoreNode = cast<StoreSDNode>(NewStore);
+    }
+
     return scalarizeVectorStore(StoreNode, DAG);
   }
 
@@ -1225,7 +1247,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
       // Put the mask in correct place
       SDValue Mask = DAG.getNode(ISD::SHL, DL, VT, MaskConstant, BitShift);
 
-      // Put the mask in correct place
+      // Put the value bits in correct place
       SDValue TruncValue = DAG.getNode(ISD::AND, DL, VT, Value, MaskConstant);
       SDValue ShiftedValue = DAG.getNode(ISD::SHL, DL, VT, TruncValue, BitShift);
 
@@ -1560,7 +1582,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
 
     unsigned ValBase = ArgLocs[In.getOrigArgIndex()].getLocMemOffset();
     unsigned PartOffset = VA.getLocMemOffset();
-    unsigned Offset = Subtarget->getExplicitKernelArgOffset() + VA.getLocMemOffset();
+    unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) + VA.getLocMemOffset();
 
     MachinePointerInfo PtrInfo(UndefValue::get(PtrTy), PartOffset - ValBase);
     SDValue Arg = DAG.getLoad(
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 19795bdde647..9210e66b0fe7 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -727,6 +727,20 @@ def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
 
 def MOV : R600_1OP <0x19, "MOV", []>;
 
+
+// This is a hack to get rid of DUMMY_CHAIN nodes.
+// Most DUMMY_CHAINs should be eliminated during legalization, but undef
+// values can sneak in some to selection.
+let isPseudo = 1, isCodeGenOnly = 1 in {
+def DUMMY_CHAIN : AMDGPUInst <
+  (outs),
+  (ins),
+  "DUMMY_CHAIN",
+  [(R600dummy_chain)]
+>;
+} // end let isPseudo = 1, isCodeGenOnly = 1
+
+
 let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
 
 class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index d0a69eafc58e..0b5715515880 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -237,7 +237,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
 
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
-  if (ST.isAmdCodeObjectV2()) {
+  if (ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF)) {
     PreloadedPrivateBufferReg = TRI->getPreloadedValue(
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }
@@ -255,7 +255,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   }
 
   if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
-    assert(ST.isAmdCodeObjectV2());
+    assert(ST.isAmdCodeObjectV2(MF) || ST.isMesaGfxShader(MF));
     MRI.addLiveIn(PreloadedPrivateBufferReg);
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
@@ -280,6 +280,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
   bool CopyBuffer = ResourceRegUsed &&
     PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+    ST.isAmdCodeObjectV2(MF) &&
     ScratchRsrcReg != PreloadedPrivateBufferReg;
 
   // This needs to be careful of the copying order to avoid overwriting one of
@@ -303,24 +304,57 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
   }
 
-  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
-    assert(!ST.isAmdCodeObjectV2());
+  if (ResourceRegUsed && (ST.isMesaGfxShader(MF) || (PreloadedPrivateBufferReg == AMDGPU::NoRegister))) {
+    assert(!ST.isAmdCodeObjectV2(MF));
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
-    unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
-    unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
     unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
     unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
 
     // Use relocations to get the pointer, and setup the other bits manually.
     uint64_t Rsrc23 = TII->getScratchRsrcWords23();
-    BuildMI(MBB, I, DL, SMovB32, Rsrc0)
-      .addExternalSymbol("SCRATCH_RSRC_DWORD0")
-      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
 
-    BuildMI(MBB, I, DL, SMovB32, Rsrc1)
-      .addExternalSymbol("SCRATCH_RSRC_DWORD1")
-      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+    if (MFI->hasPrivateMemoryInputPtr()) {
+      unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+
+      if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
+        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
+
+        BuildMI(MBB, I, DL, Mov64, Rsrc01)
+          .addReg(PreloadedPrivateBufferReg)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      } else {
+        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
+
+        PointerType *PtrTy =
+          PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
+                           AMDGPUAS::CONSTANT_ADDRESS);
+        MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+        auto MMO = MF.getMachineMemOperand(PtrInfo,
+                                           MachineMemOperand::MOLoad |
+                                           MachineMemOperand::MOInvariant |
+                                           MachineMemOperand::MODereferenceable,
+                                           0, 0);
+        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
+          .addReg(PreloadedPrivateBufferReg)
+          .addImm(0) // offset
+          .addImm(0) // glc
+          .addMemOperand(MMO)
+          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+      }
+    } else {
+      unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+      unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
+        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
+        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
+        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
+        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+
+    }
 
     BuildMI(MBB, I, DL, SMovB32, Rsrc2)
       .addImm(Rsrc23 & 0xffffffff)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 9140fe6cd148..b98f9f400ee7 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -842,7 +842,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (!AMDGPU::isShader(CallConv)) {
     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
   } else {
-    assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
+    assert(!Info->hasDispatchPtr() &&
            !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
            !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
            !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
@@ -850,6 +850,12 @@ SDValue SITargetLowering::LowerFormalArguments(
            !Info->hasWorkItemIDZ());
   }
 
+  if (Info->hasPrivateMemoryInputPtr()) {
+    unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
+    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+    CCInfo.AllocateReg(PrivateMemoryPtrReg);
+  }
+
   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
   if (Info->hasPrivateSegmentBuffer()) {
     unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
@@ -908,7 +914,7 @@ SDValue SITargetLowering::LowerFormalArguments(
     if (VA.isMemLoc()) {
       VT = Ins[i].VT;
       EVT MemVT = VA.getLocVT();
-      const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
+      const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
                               VA.getLocMemOffset();
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
@@ -1033,7 +1039,7 @@ SDValue SITargetLowering::LowerFormalArguments(
   if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
     HasStackObjects = true;
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ST.isAmdCodeObjectV2(MF)) {
     if (HasStackObjects) {
       // If we have stack objects, we unquestionably need the private buffer
       // resource. For the Code Object V2 ABI, this will be the first 4 user
@@ -2362,9 +2368,13 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   // TODO: Should this propagate fast-math-flags?
 
   switch (IntrinsicID) {
+  case Intrinsic::amdgcn_implicit_buffer_ptr: {
+    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
+    return CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, Reg, VT);
+  }
   case Intrinsic::amdgcn_dispatch_ptr:
   case Intrinsic::amdgcn_queue_ptr: {
-    if (!Subtarget->isAmdCodeObjectV2()) {
+    if (!Subtarget->isAmdCodeObjectV2(MF)) {
       DiagnosticInfoUnsupported BadIntrin(
           *MF.getFunction(), "unsupported hsa intrinsic without hsa target",
           DL.getDebugLoc());
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e911817c451d..ecd46b95ca6f 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -77,7 +77,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PrivateSegmentWaveByteOffset(false),
     WorkItemIDX(false),
     WorkItemIDY(false),
-    WorkItemIDZ(false) {
+    WorkItemIDZ(false),
+    PrivateMemoryInputPtr(false) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const Function *F = MF.getFunction();
 
@@ -114,7 +115,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   if (HasStackObjects || MaySpill)
     PrivateSegmentWaveByteOffset = true;
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ST.isAmdCodeObjectV2(MF)) {
     if (HasStackObjects || MaySpill)
       PrivateSegmentBuffer = true;
 
@@ -126,6 +127,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
 
     if (F->hasFnAttribute("amdgpu-dispatch-id"))
       DispatchID = true;
+  } else if (ST.isMesaGfxShader(MF)) {
+    if (HasStackObjects || MaySpill)
+      PrivateMemoryInputPtr = true;
   }
 
   // We don't need to worry about accessing spills with flat instructions.
@@ -182,6 +186,13 @@ unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
   return FlatScratchInitUserSGPR;
 }
 
+unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
+  PrivateMemoryPtrUserSGPR = TRI.getMatchingSuperReg(
+    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass);
+  NumUserSGPRs += 2;
+  return PrivateMemoryPtrUserSGPR;
+}
+
 SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
                                                        MachineFunction *MF,
                                                        unsigned FrameIndex,
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 3b4e233cd787..6fc8d18bceba 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -84,6 +84,9 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   unsigned ScratchRSrcReg;
   unsigned ScratchWaveOffsetReg;
 
+  // Input registers for non-HSA ABI
+  unsigned PrivateMemoryPtrUserSGPR;
+
   // Input registers setup for the HSA ABI.
   // User SGPRs in allocation order.
   unsigned PrivateSegmentBufferUserSGPR;
@@ -163,6 +166,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   bool WorkItemIDY : 1;
   bool WorkItemIDZ : 1;
 
+  // Private memory buffer
+  // Compute directly in sgpr[0:1]
+  // Other shaders indirect 64-bits at sgpr[0:1]
+  bool PrivateMemoryInputPtr : 1;
+
   MCPhysReg getNextUserSGPR() const {
     assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
     return AMDGPU::SGPR0 + NumUserSGPRs;
@@ -198,6 +206,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
   unsigned addDispatchID(const SIRegisterInfo &TRI);
   unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
+  unsigned addPrivateMemoryPtr(const SIRegisterInfo &TRI);
 
   // Add system SGPRs.
   unsigned addWorkGroupIDX() {
@@ -302,6 +311,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     return WorkItemIDZ;
   }
 
+  bool hasPrivateMemoryInputPtr() const {
+    return PrivateMemoryInputPtr;
+  }
+
   unsigned getNumUserSGPRs() const {
     return NumUserSGPRs;
   }
@@ -338,6 +351,10 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
     return QueuePtrUserSGPR;
   }
 
+  unsigned getPrivateMemoryPtrUserSGPR() const {
+    return PrivateMemoryPtrUserSGPR;
+  }
+
   bool hasSpilledSGPRs() const {
     return HasSpilledSGPRs;
   }
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 8c4b24a4504d..a1ed5e8441df 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1108,10 +1108,12 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF,
   case SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
     return MFI->PrivateSegmentWaveByteOffsetSystemSGPR;
   case SIRegisterInfo::PRIVATE_SEGMENT_BUFFER:
-    assert(ST.isAmdCodeObjectV2() &&
-           "Non-CodeObjectV2 ABI currently uses relocations");
-    assert(MFI->hasPrivateSegmentBuffer());
-    return MFI->PrivateSegmentBufferUserSGPR;
+    if (ST.isAmdCodeObjectV2(MF)) {
+      assert(MFI->hasPrivateSegmentBuffer());
+      return MFI->PrivateSegmentBufferUserSGPR;
+    }
+    assert(MFI->hasPrivateMemoryInputPtr());
+    return MFI->PrivateMemoryPtrUserSGPR;
   case SIRegisterInfo::KERNARG_SEGMENT_PTR:
     assert(MFI->hasKernargSegmentPtr());
     return MFI->KernargSegmentPtrUserSGPR;
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 5efa64d25ce1..c2a4d4ba99b1 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -70,8 +70,10 @@ class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> {
 }
 
 class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> {
+  // v_div_scale_{f32|f64} do not support input modifiers.
+  let HasModifiers = 0;
   let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
-  let Asm64 = " $vdst, $sdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$clamp$omod";
+  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
 }
 
 def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32> {
@@ -168,12 +170,14 @@ def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPU
 def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
   let SchedRW = [WriteFloatFMA, WriteSALU];
   let hasExtraSrcRegAllocReq = 1;
+  let AsmMatchConverter = "";
 }
 
 // Double precision division pre-scale.
 def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64, [], 1> {
   let SchedRW = [WriteDouble, WriteSALU];
   let hasExtraSrcRegAllocReq = 1;
+  let AsmMatchConverter = "";
 }
 
 def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_msad_u8>;
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 8ec9cb02813c..95db35ce8ffb 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -164,6 +164,9 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Emit the rest of the function body.
   EmitFunctionBody();
 
+  // Emit the XRay table for this function.
+  emitXRayTable();
+
   // If we need V4T thumb mode Register Indirect Jump pads, emit them.
   // These are created per function, rather than per TU, since it's
   // relatively easy to exceed the thumb branch range within a TU.
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index fb4c689bcb59..1606c1576465 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -7571,11 +7571,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FLT_ROUNDS_:   return LowerFLT_ROUNDS_(Op, DAG);
   case ISD::MUL:           return LowerMUL(Op, DAG);
   case ISD::SDIV:
-    if (Subtarget->isTargetWindows())
+    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
       return LowerDIV_Windows(Op, DAG, /* Signed */ true);
     return LowerSDIV(Op, DAG);
   case ISD::UDIV:
-    if (Subtarget->isTargetWindows())
+    if (Subtarget->isTargetWindows() && !Op.getValueType().isVector())
       return LowerDIV_Windows(Op, DAG, /* Signed */ false);
     return LowerUDIV(Op, DAG);
   case ISD::ADDC:
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index da96040d9996..08fe2bad281e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -31272,93 +31272,6 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
 }
 
-/// Check if truncation with saturation form type \p SrcVT to \p DstVT
-/// is valid for the given \p Subtarget.
-static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
-                                        const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX512())
-    return false;
-
-  // FIXME: Scalar type may be supported if we move it to vector register.
-  if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
-    return false;
-
-  EVT SrcElVT = SrcVT.getScalarType();
-  EVT DstElVT = DstVT.getScalarType();
-  if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
-    return false;
-  if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
-    return false;
-  if (SrcVT.is512BitVector() || Subtarget.hasVLX())
-    return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
-  return false;
-}
-
-/// Return true if VPACK* instruction can be used for the given types
-/// and it is avalable on \p Subtarget.
-static bool
-isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
-  if (Subtarget.hasSSE2())
-    // v16i16 -> v16i8
-    if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8)
-      return true;
-  if (Subtarget.hasSSE41())
-    // v8i32 -> v8i16
-    if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16)
-      return true;
-  return false;
-}
-
-/// Detect a pattern of truncation with saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectUSatPattern(SDValue In, EVT VT) {
-  if (In.getOpcode() != ISD::UMIN)
-    return SDValue();
-
-  //Saturation with truncation. We truncate from InVT to VT.
-  assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
-    "Unexpected types for truncate operation");
-
-  APInt C;
-  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
-    // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
-    // the element size of the destination type.
-    return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) :
-      SDValue();
-  }
-  return SDValue();
-}
-
-/// Detect a pattern of truncation with saturation:
-/// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
-/// The types should allow to use VPMOVUS* instruction on AVX512.
-/// Return the source value to be truncated or SDValue() if the pattern was not
-/// matched.
-static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
-                                       const X86Subtarget &Subtarget) {
-  if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
-    return SDValue();
-  return detectUSatPattern(In, VT);
-}
-
-static SDValue
-combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
-                        const X86Subtarget &Subtarget) {
-  SDValue USatVal = detectUSatPattern(In, VT);
-  if (USatVal) {
-    if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
-      return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
-    if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) {
-      SDValue Lo, Hi;
-      std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL);
-      return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi);
-    }
-  }
-  return SDValue();
-}
-
 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
 /// X86ISD::AVG instruction.
@@ -31925,12 +31838,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
                           St->getPointerInfo(), St->getAlignment(),
                           St->getMemOperand()->getFlags());
 
-    if (SDValue Val =
-        detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
-      return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
-                             dl, Val, St->getBasePtr(),
-                             St->getMemoryVT(), St->getMemOperand(), DAG);
-
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     unsigned NumElems = VT.getVectorNumElements();
     assert(StVT != VT && "Cannot truncate to the same type");
@@ -32551,10 +32458,6 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
   if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
     return Avg;
 
-  // Try to combine truncation with unsigned saturation.
-  if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
-    return Val;
-
   // The bitcast source is a direct mmx result.
   // Detect bitcasts between i32 to x86mmx
   if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
@@ -33790,11 +33693,11 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // Try to synthesize horizontal adds from adds of shuffles.
+  // Try to synthesize horizontal subs from subs of shuffles.
   EVT VT = N->getValueType(0);
   if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
        (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
-      isHorizontalBinOp(Op0, Op1, true))
+      isHorizontalBinOp(Op0, Op1, false))
     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
 
   return OptimizeConditionalInDecrement(N, DAG);
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 6e30919246c7..7b0bddbbb831 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1436,6 +1436,14 @@ static bool canSinkInstructions(
     if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
         I->getType()->isTokenTy())
       return false;
+
+    // Conservatively return false if I is an inline-asm instruction. Sinking
+    // and merging inline-asm instructions can potentially create arguments
+    // that cannot satisfy the inline-asm constraints.
+    if (const auto *C = dyn_cast<CallInst>(I))
+      if (C->isInlineAsm())
+        return false;
+
     // Everything must have only one use too, apart from stores which
     // have no uses.
     if (!isa<StoreInst>(I) && !I->hasOneUse())
diff --git a/test/Analysis/BasicAA/pr31761.ll b/test/Analysis/BasicAA/pr31761.ll
new file mode 100644
index 000000000000..318dfdc9c246
--- /dev/null
+++ b/test/Analysis/BasicAA/pr31761.ll
@@ -0,0 +1,19 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+%struct.blam = type { i32, i32 }
+
+
+; CHECK-DAG: MayAlias: i32* %tmp, i32* %tmp3
+
+define i1 @ham(%struct.blam* %arg)  {
+  %isNull = icmp eq %struct.blam* %arg, null
+  %tmp = getelementptr  %struct.blam, %struct.blam* %arg, i64 0, i32 0
+  %tmp2 = getelementptr  %struct.blam, %struct.blam* %arg, i64 0, i32 1
+  %select = select i1 %isNull, i32* null, i32* %tmp2
+  %tmp3 = getelementptr  i32, i32* %select, i32 -1
+  ret i1 true
+}
diff --git a/test/CodeGen/AArch64/no-quad-ldp-stp.ll b/test/CodeGen/AArch64/no-quad-ldp-stp.ll
index 6324835b322b..fb030d291362 100644
--- a/test/CodeGen/AArch64/no-quad-ldp-stp.ll
+++ b/test/CodeGen/AArch64/no-quad-ldp-stp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+no-quad-ldst-pairs -verify-machineinstrs -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-eabi -mattr=+slow-paired-128 -verify-machineinstrs -asm-verbose=false | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m1 -verify-machineinstrs -asm-verbose=false | FileCheck %s
 
 ; CHECK-LABEL: test_nopair_st
diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index 6495a1480df6..edad18e244d0 100644
--- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; On Southern Islands GPUs the local address space(3) uses 32-bit pointers and
 ; the global address space(1) uses 64-bit pointers.  These tests check to make sure
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index a41d3071377d..6c5cdd3877d1 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_i16:
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index f37247361ece..a6247c735240 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ;FUNC-LABEL: {{^}}test1:
diff --git a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
index ad6843770fd6..a6d055891d4b 100644
--- a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -mattr=+promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE -check-prefix=HSA %s
 ; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-ALLOCA %s
 ; RUN: llc -mattr=-promote-alloca,-flat-for-global -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck  -check-prefix=GCN -check-prefix=GCN-ALLOCA -check-prefix=HSA %s
-; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
-; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN  -check-prefix=GCN-ALLOCA %s
+; RUN: llc -mattr=+promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-PROMOTE %s
+; RUN: llc -mattr=-promote-alloca -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN  -check-prefix=GCN-ALLOCA %s
 
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index f05f027fb3c8..e515ca00d184 100644
--- a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; Legacy intrinsics that just read implicit parameters
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index d3d3cec9bbbf..5d9dcf64debf 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll
index a1d3715a095c..87b4c86427c8 100644
--- a/test/CodeGen/AMDGPU/anyext.ll
+++ b/test/CodeGen/AMDGPU/anyext.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 6a2716cc903e..25eae0b41ae4 100644
--- a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC  %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SICI -check-prefix=GCN -check-prefix=FUNC  %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=SICI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_cmpxchg_ret_i32_offset:
 ; SICI: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll
index 20c685447eef..4b014e09b630 100644
--- a/test/CodeGen/AMDGPU/atomic_load_add.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}atomic_add_local:
diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll
index 184d07ffad9c..c6e5b1136d7c 100644
--- a/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}atomic_sub_local:
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index 24874ee7fa98..104dd45e8a1a 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCNNOOPT -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNOPT -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_branch:
 ; GCNNOOPT: v_writelane_b32
diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll
index 03349349735d..5156137fd78a 100644
--- a/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/test/CodeGen/AMDGPU/bfi_int.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=R600 %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI %s
 
 ; BFI_INT Definition pattern from ISA docs
 ; (y & x) | (z & ~x)
diff --git a/test/CodeGen/AMDGPU/bfm.ll b/test/CodeGen/AMDGPU/bfm.ll
index 73db87d7ae9e..790458d0d60c 100644
--- a/test/CodeGen/AMDGPU/bfm.ll
+++ b/test/CodeGen/AMDGPU/bfm.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}bfm_pattern:
diff --git a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 2482fa761b19..3a55870c2882 100644
--- a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; The bitcast should be pushed through the bitcasts so the vectors can
 ; be broken down and the shared components can be CSEd
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index aca88a9ef2a3..43a4200cb3bd 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 
 declare i16 @llvm.bitreverse.i16(i16) #1
 declare i32 @llvm.bitreverse.i32(i32) #1
diff --git a/test/CodeGen/AMDGPU/br_cc.f16.ll b/test/CodeGen/AMDGPU/br_cc.f16.ll
index 340d30b898e0..0072d384f217 100644
--- a/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}br_cc_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll
index 86d96435a29e..94616a4be8fd 100644
--- a/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; This used to crash because during intermediate control flow lowering, there
 ; was a sequence
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll
index 23b93ce2f075..c68951731098 100644
--- a/test/CodeGen/AMDGPU/bswap.ll
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.bswap.i32(i32) nounwind readnone
 declare <2 x i32> @llvm.bswap.v2i32(<2 x i32>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/bug-vopc-commute.ll b/test/CodeGen/AMDGPU/bug-vopc-commute.ll
index 990671102757..7c02d8385462 100644
--- a/test/CodeGen/AMDGPU/bug-vopc-commute.ll
+++ b/test/CodeGen/AMDGPU/bug-vopc-commute.ll
@@ -1,7 +1,5 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-target triple = "amdgcn--"
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}main:
 ;
diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll
index 65eacf5adc41..0a5774c601d3 100644
--- a/test/CodeGen/AMDGPU/build_vector.ll
+++ b/test/CodeGen/AMDGPU/build_vector.ll
@@ -1,6 +1,6 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
 
 ; R600: {{^}}build_vector2:
 ; R600: MOV
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 82f88a079307..6db9a0761a01 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; OPT-LABEL: @test_no_sink_flat_small_offset_i32(
 ; OPT: getelementptr i32, i32 addrspace(4)* %in
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index f3425ca9fdd5..2ed2857ff340 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -1,9 +1,9 @@
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tahiti < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-SI %s
 ; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=bonaire < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-CI %s
-; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
+; RUN: opt -S -codegenprepare -mtriple=amdgcn-unknown-unknown -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=OPT -check-prefix=OPT-VI %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; OPT-LABEL: @test_sink_global_small_offset_i32(
 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 7aa6a53e9025..066ef951cc31 100644
--- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -codegenprepare < %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -codegenprepare < %s | FileCheck -check-prefix=OPT %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -codegenprepare < %s | FileCheck -check-prefix=OPT %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; This particular case will actually be worse in terms of code size
 ; from sinking into both.
diff --git a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll b/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
deleted file mode 100644
index 8227d4c873ee..000000000000
--- a/test/CodeGen/AMDGPU/ci-use-flat-for-global.ll
+++ /dev/null
@@ -1,26 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
-
-
-; There are no stack objects even though flat is used by default, so
-; flat_scratch_init should be disabled.
-
-; ALL-LABEL: {{^}}test:
-; HSA: .amd_kernel_code_t
-; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: .end_amd_kernel_code_t
-
-; ALL-NOT: flat_scr
-
-; HSA-DEFAULT: flat_store_dword
-; HSA-NODEFAULT: buffer_store_dword
-
-; NOHSA-DEFAULT: buffer_store_dword
-; NOHSA-NODEFAULT: flat_store_dword
-define void @test(i32 addrspace(1)* %out) {
-entry:
-  store i32 0, i32 addrspace(1)* %out
-  ret void
-}
diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll
index a09ed1f73857..2e6be5d10f09 100644
--- a/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_concat_v1i32:
 ; 0x80f000 is the high 32 bits of the resource descriptor used by MUBUF
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 3e167846c22c..0ff75ab58003 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}fold_mi_v_and_0:
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 6918dff74d57..7434d745b259 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll
index fc875f6ef7a3..3422a889a520 100644
--- a/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -mattr=-promote-alloca -verify-machineinstrs < %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mattr=-promote-alloca -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -verify-machineinstrs < %s
 
 ; Test that CopyToReg instructions don't have non-register operands prior
 ; to being emitted.
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index 13f38d25aa8b..1a0027dd4a3c 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 900938bab700..d390f64deeab 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index e53ad13464e8..9692236bb363 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC -check-prefix=VI %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index 21f366687c74..cd5d805e5db3 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare i64 @llvm.ctpop.i64(i64) nounwind readnone
 declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index 56fcb51fe14e..e33cc18eb05f 100644
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll
index c5d1f86cea76..9b512c439b0e 100644
--- a/test/CodeGen/AMDGPU/cube.ll
+++ b/test/CodeGen/AMDGPU/cube.ll
@@ -14,7 +14,7 @@ declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
 ; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; GCN: buffer_store_dwordx4
+; GCN: _store_dwordx4
 define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
@@ -34,7 +34,7 @@ define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c)
 ; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN: buffer_store_dwordx4
+; GCN: _store_dwordx4
 define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
   %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)
   store <4 x float> %cube, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index ed9b8273fa4a..7baaa81fba59 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
index 864ac40260b3..d38411dcca61 100644
--- a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
+++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -enable-no-nans-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NONAN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.fabs.f32(float) #1
diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll
index e527b8511fda..628dd5ec839e 100644
--- a/test/CodeGen/AMDGPU/elf.ll
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
-; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
-; RUN: llc < %s -march=amdgcn -mcpu=carrizo -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TONGA %s
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs -o - | FileCheck --check-prefix=CONFIG --check-prefix=TYPICAL %s
 
 ; Test that we don't try to produce a COFF file on windows
-; RUN: llc < %s -mtriple=amdgcn-pc-mingw -mcpu=SI -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
+; RUN: llc < %s -mtriple=amdgcn-pc-mingw -verify-machineinstrs -filetype=obj | llvm-readobj -s -symbols -file-headers - | FileCheck --check-prefix=ELF %s
 
 ; ELF: Format: ELF64
 ; ELF: OS/ABI: AMDGPU_HSA (0x40)
diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll
index 2cb5cf0422dc..8b3e087d1f45 100644
--- a/test/CodeGen/AMDGPU/extload.ll
+++ b/test/CodeGen/AMDGPU/extload.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FIXME: This seems to not ever actually become an extload
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index d0b19c825ee9..4594379dae03 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v3f64_2:
 ; GCN: buffer_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index eea44b8a006c..c407f0efffb4 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
 ; GCN: buffer_load_ushort
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 0a51c39f026f..1df91c93329a 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; How the replacement of i64 stores with v2i32 stores resulted in
 ; breaking other users of the bitcast if they already existed
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 9005bfa07c2b..6f4ae827f432 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
 ; GCN: buffer_load_ubyte
diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll
index 3742b58d500a..98e7f9e3e9ad 100644
--- a/test/CodeGen/AMDGPU/fabs.ll
+++ b/test/CodeGen/AMDGPU/fabs.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll
index fb2d418b4436..9ca077564e2b 100644
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fadd_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll
index 11436794ac98..0f683f7bfa23 100644
--- a/test/CodeGen/AMDGPU/fadd.ll
+++ b/test/CodeGen/AMDGPU/fadd.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefix=SI -check-prefix=FUNC
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
 
 ; FUNC-LABEL: {{^}}fadd_f32:
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
index 19c17289da3d..6f0c9de8ebaf 100644
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -23,7 +23,7 @@ define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
 ; CHECK-LABEL: {{^}}v_fadd_v2f64:
 ; CHECK: v_add_f64
 ; CHECK: v_add_f64
-; CHECK: buffer_store_dwordx4
+; CHECK: _store_dwordx4
 define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                           <2 x double> addrspace(1)* %in2) {
   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
@@ -36,7 +36,7 @@ define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac
 ; CHECK-LABEL: {{^}}s_fadd_v2f64:
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
-; CHECK: buffer_store_dwordx4
+; CHECK: _store_dwordx4
 define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
   %r2 = fadd <2 x double> %r0, %r1
   store <2 x double> %r2, <2 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index fb693de0e39d..ad3992f4cd03 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -167,6 +167,6 @@ define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1
 }
 
 attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind "target-features"="-fp16-denormals,-fp16-denormals" }
-attributes #3 = { nounwind "target-features"="+fp16-denormals,+fp64-denormals" }
+attributes #1 = { nounwind "target-features"="-flat-for-global" }
+attributes #2 = { nounwind "target-features"="-flat-for-global,-fp16-denormals,-fp16-denormals" }
+attributes #3 = { nounwind "target-features"="-flat-for-global,+fp16-denormals,+fp64-denormals" }
diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll
index f23e8919d733..efdda78f852b 100644
--- a/test/CodeGen/AMDGPU/fceil.ll
+++ b/test/CodeGen/AMDGPU/fceil.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.ceil.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll
index c6e1ed5a1f29..a62726f7f068 100644
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fcmp_f16_lt
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
index 053ab0ed7aaf..acce82fdfe53 100644
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=tahiti -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
 
 ; CHECK-LABEL: {{^}}flt_f64:
 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 9f01a660be3a..632de18dafcb 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.copysign.f32(float, float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll
index b34a4695387f..12c942beee6c 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 declare double @llvm.copysign.f64(double, double) nounwind readnone
 declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index fad45c6816d2..70b70bdaaaa7 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; Make sure fdiv is promoted to f32.
 
diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll
index 2ace0762c677..20f9e4df07fd 100644
--- a/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
 
 
 ; COMMON-LABEL: {{^}}fdiv_f64:
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index a96035fca3dc..0e95de9c555c 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -252,8 +252,8 @@ define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> ad
   ret void
 }
 
-attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals" }
-attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals" }
-attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals" }
+attributes #0 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="-fp32-denormals,-flat-for-global" }
+attributes #1 = { nounwind "enable-unsafe-fp-math"="true" "target-features"="-fp32-denormals,-flat-for-global" }
+attributes #2 = { nounwind "enable-unsafe-fp-math"="false" "target-features"="+fp32-denormals,-flat-for-global" }
 
 !0 = !{float 2.500000e+00}
diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll
index 904dfe31b15e..83ffbdfa23a5 100644
--- a/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
 
 declare double @llvm.fabs.f64(double %Val)
 declare double @llvm.floor.f64(double) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll
index 61c46ac2bc03..d7f35a45075c 100644
--- a/test/CodeGen/AMDGPU/ffloor.ll
+++ b/test/CodeGen/AMDGPU/ffloor.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}floor_f32:
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index 0cfe6888b335..55b5482d031f 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=bonaire < %s | FileCheck  %s
-; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga < %s | FileCheck  %s
-; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=CHECK,HSA %s
+; RUN: llc -O0 -mtriple=amdgcn-mesa-mesa3d -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck  %s
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global < %s | FileCheck -check-prefixes=CHECK,HSA %s
 
 ; Disable optimizations in case there are optimizations added that
 ; specialize away generic pointer accesses.
diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
new file mode 100644
index 000000000000..df9ba00c6974
--- /dev/null
+++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=HSA -check-prefix=HSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=HSA-NOADDR64 -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=-flat-for-global < %s | FileCheck -check-prefix=NOHSA-DEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=kaveri -mattr=+flat-for-global < %s | FileCheck -check-prefix=NOHSA-NODEFAULT -check-prefix=ALL %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=NOHSA-NOADDR64 -check-prefix=ALL %s
+
+
+; There are no stack objects even though flat is used by default, so
+; flat_scratch_init should be disabled.
+
+; ALL-LABEL: {{^}}test:
+; HSA: .amd_kernel_code_t
+; HSA: enable_sgpr_flat_scratch_init = 0
+; HSA: .end_amd_kernel_code_t
+
+; ALL-NOT: flat_scr
+
+; HSA-DEFAULT: flat_store_dword
+; HSA-NODEFAULT: buffer_store_dword
+; HSA-NOADDR64: flat_store_dword
+
+; NOHSA-DEFAULT: buffer_store_dword
+; NOHSA-NODEFAULT: flat_store_dword
+; NOHSA-NOADDR64: flat_store_dword
+define void @test(i32 addrspace(1)* %out) {
+entry:
+  store i32 0, i32 addrspace(1)* %out
+  ret void
+}
+
+; HSA-DEFAULT: flat_store_dword
+; HSA-NODEFAULT: buffer_store_dword
+; HSA-NOADDR64: flat_store_dword
+
+; NOHSA-DEFAULT: buffer_store_dword
+; NOHSA-NODEFAULT: flat_store_dword
+; NOHSA-NOADDR64: flat_store_dword
+define void @test_addr64(i32 addrspace(1)* %out) {
+entry:
+  %out.addr = alloca i32 addrspace(1)*, align 4
+
+  store i32 addrspace(1)* %out, i32 addrspace(1)** %out.addr, align 4
+  %ld0 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4
+
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %ld0, i32 0
+  store i32 1, i32 addrspace(1)* %arrayidx, align 4
+
+  %ld1 = load i32 addrspace(1)*, i32 addrspace(1)** %out.addr, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %ld1, i32 1
+  store i32 2, i32 addrspace(1)* %arrayidx1, align 4
+
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index f15cbef56b19..b71c8bcb76c7 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -5,7 +5,7 @@
 ; RUN: llc -march=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=VI-NOXNACK  -check-prefix=GCN %s
 
 ; RUN: llc -march=amdgcn -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=stoney  -verify-machineinstrs < %s | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
 ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-CI -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-VI-NOXNACK -check-prefix=GCN %s
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
index 0a55ef778557..cf6d7d824992 100644
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare double @llvm.fma.f64(double, double, double) nounwind readnone
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index 4dae566d8489..4d42a4630e22 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare double @llvm.maxnum.f64(double, double) nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index c0fde6e97f6f..7c01ca85f6b9 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.maxnum.f32(float, float) nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
index de563cec3412..fec3a358a4fa 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare double @llvm.maxnum.f64(double, double) #0
 declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) #0
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 6a7d760495bc..4058247a6da9 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.maxnum.f32(float, float) #0
 declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 2d1facfc3a40..3102ffdbdd28 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare float @llvm.minnum.f32(float, float) nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index 9081a1e89511..abd2b9d3e4d1 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.minnum.f32(float, float) #0
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 10acae092e9f..8663a2129fc0 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -1,5 +1,5 @@
 ; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
 ; make add an instruction if the fadd has more than one use.
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index 9ce4d7684fe5..4f47d2c8e755 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fmul_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll
index 9064ad3814d6..d0c39b539456 100644
--- a/test/CodeGen/AMDGPU/fmul.ll
+++ b/test/CodeGen/AMDGPU/fmul.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fmul_f32:
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
index 0af44ef200ea..f5e64b3c5941 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -march=amdgcn -mcpu=verde  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,SI %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
 ; RUN: llc -march=amdgcn -mcpu=verde  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
-; RUN: llc -march=amdgcn -mcpu=tonga  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global  -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,VI %s
 
 ; GCN-LABEL: {{^}}fmuladd_f64:
 ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll
index 4fa9adaabdae..5423fadf81e2 100644
--- a/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s
 
 ; This should have the exactly the same output as the test for rint,
 ; so no need to check anything.
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index d555d8d871de..3f9928c2b623 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1,4 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-unsafe-fp-math -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
 
 ; --------------------------------------------------------------------------------
 ; fadd tests
@@ -7,8 +8,12 @@
 ; GCN-LABEL: {{^}}v_fneg_add_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
-; GCN-NEXT: buffer_store_dword [[RESULT]]
+
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
+
+; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
+; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
 define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -72,8 +77,12 @@ define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrsp
 ; GCN-LABEL: {{^}}v_fneg_add_fneg_x_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-NEXT: buffer_store_dword [[ADD]]
+
+; GCN-SAFE: v_subrev_f32_e32
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000,
+
+; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
 define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -92,8 +101,12 @@ define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
-; GCN-NEXT: buffer_store_dword [[ADD]]
+
+; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
+
+; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
 define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -112,8 +125,12 @@ define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-NEXT: buffer_store_dword [[ADD]]
+
+; GCN-SAFE: v_sub_f32_e64 [[ADD:v[0-9]+]], -[[A]], [[B]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
+
+; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
 define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -133,10 +150,16 @@ define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(
 ; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_A]]
+
+; GCN-SAFE: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1{{$}}
+; GCN-SAFE: v_xor_b32_e32 [[NEG_A:v[0-9]+]], [[A]], [[SIGNBIT]]
+; GCN-SAFE: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], [[ADD]], [[SIGNBIT]]
+
+; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
 define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -156,10 +179,15 @@ define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float add
 ; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
-; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
-; GCN-NEXT: buffer_store_dword [[MUL]]
+
+; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-SAFE-DAG: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[ADD]]
+
+; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
+; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
 define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -362,8 +390,12 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[RESULT]]
+
+; GCN-SAFE: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[RESULT]]
+
+; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
 define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -436,8 +468,12 @@ define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[FMA]]
+
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], [[C]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -459,8 +495,12 @@ define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[FMA]]
+
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -482,8 +522,12 @@ define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[FMA]]
+
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], -[[B]], [[C]]
+; GCN-SAFE: v_xor_b32_e32 v{{[[0-9]+}}, 0x80000000, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -506,8 +550,12 @@ define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspac
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
-; GCN-NEXT: buffer_store_dword [[FMA]]
+
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; GCN-SAFE: v_xor_b32_e32 v{{[[0-9]+}}, 0x80000000, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -530,8 +578,12 @@ define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspac
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
-; GCN-NEXT: buffer_store_dword [[FMA]]
+
+; GCN-NSZ-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NSZ-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -553,10 +605,15 @@ define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
-; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[FMA]]
-; GCN-NEXT: buffer_store_dword [[NEG_A]]
+
+; GCN-SAFE: v_xor_b32
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], -[[A]],
+; GCN-SAFE: v_xor_b32
+
+; GCN-NSZ-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
+; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
+; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
 define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -579,10 +636,14 @@ define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float a
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
-; GCN-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
-; GCN-NEXT: buffer_store_dword [[MUL]]
+
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[FMA]]
+
+; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
+; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
 define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -610,8 +671,12 @@ define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float a
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
-; GCN-NEXT: buffer_store_dword [[RESULT]]
+
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[C]]
+
+; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
 define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index 62c73c6b93d4..d16e83fd4d5b 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 ; FIXME: Check something here. Currently it seems fabs + fneg aren't
 ; into 2 modifiers, although theoretically that should work.
diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll
index 7627a4d32250..b7080f4622a3 100644
--- a/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_f64:
 ; GCN: v_xor_b32
diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll
index 941606ca0716..007c6dcadd9e 100644
--- a/test/CodeGen/AMDGPU/fneg.ll
+++ b/test/CodeGen/AMDGPU/fneg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}s_fneg_f32:
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index 35e9541692db..01bc53ff35a5 100644
--- a/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index 8b05d7b88a10..a9f493bf0ccd 100644
--- a/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 
 declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 346ad822f293..3e426e3e94b1 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
index 381c375151e8..a2fa7a190745 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s --check-prefix=SI --check-prefix=FUNC --check-prefix=GCN
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s --check-prefix=VI --check-prefix=FUNC --check-prefix=GCN
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 
 declare float @llvm.fabs.f32(float) #1
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll
index d089c291f461..cbff9f22b073 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,FUNC,SI
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,FUNC,VI
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,FUNC,VI
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s -check-prefix=EG -check-prefix=FUNC
 
 declare float @llvm.fabs.f32(float) #1
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index 37796a999b18..c4f5d7cdfb5d 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fpext_f16_to_f32
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll
index ad06bdd90a9f..6dc84b01d734 100644
--- a/test/CodeGen/AMDGPU/fpext.ll
+++ b/test/CodeGen/AMDGPU/fpext.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fpext_f32_to_f64:
 ; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll
index 3c973e09d22b..71f56d730e96 100644
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fptosi_f16_to_i16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll
index a74d6d333dbe..a6876624a0c6 100644
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fptoui_f16_to_i16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll
index a067960c5848..284fc53c8240 100644
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fptrunc_f32_to_f16
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll
index 1fd125964bae..0c7b67406a89 100644
--- a/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-UNSAFE %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN-UNSAFE %s
 
 ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
 ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 29b90a43ab68..0651dce8d95c 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=FUNC %s
 
 ; RUN: llc -march=amdgcn -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=SI-UNSAFE -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=VI-UNSAFE -check-prefix=FUNC %s
 
 declare double @llvm.fabs.f64(double) #0
 declare double @llvm.floor.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
index 7d713f483047..4e1a503b1298 100644
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN-UNSAFE -check-prefix=GCN %s
 
 declare float @llvm.fabs.f32(float) #0
 declare float @llvm.floor.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index e0fc263294ab..039623c02194 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -enable-misched < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -enable-misched < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}frem_f32:
 ; GCN-DAG: buffer_load_dword [[X:v[0-9]+]], {{.*$}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
index ce0881c329be..ed040436a61a 100644
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index f98cac6ade3a..b6526b8e0787 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index fb15edbaaffc..0b3c8ac2503d 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}fsub_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll
index 1beeab65ade3..d0718394e7f1 100644
--- a/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG --check-prefix=FUNC %s
 
 declare float @llvm.trunc.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll
index f5ab390ce686..f96463613e8e 100644
--- a/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll
index be775cf9292f..450b7d367429 100644
--- a/test/CodeGen/AMDGPU/global-directive.ll
+++ b/test/CodeGen/AMDGPU/global-directive.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; Make sure the GlobalDirective isn't merged with the function name
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
index 4d9992997163..2c7c02de1673 100644
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; XUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; FIXME: cypress is broken because the bigger testcases spill and it's not implemented
 
diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll
index 743ad7c278be..909ceb5546c6 100644
--- a/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_offset:
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
diff --git a/test/CodeGen/AMDGPU/global_atomics_i64.ll b/test/CodeGen/AMDGPU/global_atomics_i64.ll
index 2bae66d5aea8..f66c6c7b531a 100644
--- a/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index 1f9b536cd80b..d07843e9dd27 100644
--- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/gv-offset-folding.ll b/test/CodeGen/AMDGPU/gv-offset-folding.ll
index c75fdb35dd0e..af5ee8e66750 100644
--- a/test/CodeGen/AMDGPU/gv-offset-folding.ll
+++ b/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -relocation-model=static < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -relocation-model=static < %s | FileCheck %s
 
 @lds = external addrspace(3) global [4 x i32]
 
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index f2bb3f9d110a..aa22e83fade2 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; half args should be promoted to float for SI and lower.
 
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index ae6c1891a84c..a4e599230b74 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -2,9 +2,9 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx701 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI701 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx702 | FileCheck --check-prefix=HSA --check-prefix=HSA-CI702 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=HSA --check-prefix=HSA-CI700 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI801 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI802 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris10 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=polaris11 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx800 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI800 %s
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index fd9f469b057d..d4912776debd 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}br_i1_phi:
diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll
index 3d42eac44297..33ad0c9199b9 100644
--- a/test/CodeGen/AMDGPU/icmp64.ll
+++ b/test/CodeGen/AMDGPU/icmp64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}test_i64_eq:
 ; SI: v_cmp_eq_u64
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index 0412c5da4d53..ef6008aa5fde 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
 ; GCN-LABEL: {{^}}i64_imm_inline_lo:
diff --git a/test/CodeGen/AMDGPU/imm16.ll b/test/CodeGen/AMDGPU/imm16.ll
index ed970287abbf..2e73eb06502f 100644
--- a/test/CodeGen/AMDGPU/imm16.ll
+++ b/test/CodeGen/AMDGPU/imm16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 
 ; FIXME: Merge into imm.ll
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 528e12b76ce0..208e55c143ac 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
-; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
 
 ; Tests for indirect addressing on SI, which is implemented using dynamic
 ; indexing of vectors.
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
index 4c9ef2e61f83..4db87c3c1b64 100644
--- a/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA16 -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mattr=-promote-alloca,+max-private-element-size-4 -verify-machineinstrs < %s | FileCheck -check-prefix=SI-ALLOCA4 -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=SI-PROMOTE -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca,+max-private-element-size-16 -verify-machineinstrs < %s | FileCheck -check-prefix=CI-ALLOCA16 -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=CI-PROMOTE -check-prefix=SI %s
 
 declare void @llvm.amdgcn.s.barrier() #0
 
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
index 7233aa57fd78..3e0b695934c7 100644
--- a/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}infinite_loop:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e7
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index cef1b114cd32..db1a0c67436d 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}inline_asm:
 ; CHECK: s_endpgm
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 2c538b16e743..65ac693a4f44 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 
 ; FIXME: Broken on evergreen
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 7cc9c7c1d923..85cd903a405d 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -63,7 +63,7 @@ body: |
     S_BRANCH %bb.3
 
   bb.3:
-    %vgpr4, %vcc = V_DIV_SCALE_F32 0, %vgpr1, 0, %vgpr1, 0, %vgpr3, 0, 0, implicit %exec
+    %vgpr4, %vcc = V_DIV_SCALE_F32 %vgpr1, %vgpr1, %vgpr3, implicit %exec
     %vgpr0 = V_DIV_FMAS_F32 0, %vgpr1, 0, %vgpr2, 0, %vgpr3, 0, 0, implicit %vcc, implicit %exec
     S_ENDPGM
 
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index b1f20fd995fd..95a68319f8af 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=SI,GCN,MESA-GCN,FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,MESA-VI,MESA-GCN,FUNC
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefixes=VI,GCN,HSA-VI,FUNC
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index fb0e15eb0cb9..ea9754a390b6 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
-; RUN: llc -march=amdgcn -mcpu=carrizo < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}large_alloca_pixel_shader:
 ; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
index d56b48457285..77dd4b134982 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
index bf5d492dca41..ee47b14c496d 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
index 88e9c6529778..2336109f4dad 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare float @llvm.fabs.f32(float) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index 93911d4a91f1..9c845e84bc12 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
 declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 181d68c8ea75..22097418eec4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
 declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index 2247485f2993..011a0fdbd219 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.fabs.f16(half %a)
 declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
index 5d9f1b824a69..410ac59279a5 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.cos.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index d69cd7c6068c..6d262cf497ac 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index f9b390eca0c2..cc1504f2bc8d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone
 declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index af934ede040a..d408fe9f87f6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
 
 ; FIXME: Enable for VI.
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 38e4b8440d32..8e5c62c31db5 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -322,7 +322,8 @@ define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float a
 ; SI-LABEL: {{^}}test_div_scale_f32_fabs_num:
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], |[[A]]|
+; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[A]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
 define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
@@ -344,7 +345,8 @@ define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspa
 ; SI-LABEL: {{^}}test_div_scale_f32_fabs_den:
 ; SI-DAG: buffer_load_dword [[A:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; SI-DAG: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
-; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], |[[B]]|, |[[B]]|, [[A]]
+; SI: v_and_b32_e32 [[ABS_B:v[0-9]+]], 0x7fffffff, [[B]]
+; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
 define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
index c1af0ecb3edc..d8c1af036a34 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.fract.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
index 1cca9eb6a77a..a75267b8d693 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
 
 declare float @llvm.amdgcn.fract.f32(float) #0
 declare double @llvm.amdgcn.fract.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
index 51a1e0776728..7521224058f3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
index 94e92f6f1690..9c49f175f2b5 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s  | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s  | FileCheck -check-prefix=GCN %s
 
 declare float @llvm.fabs.f32(float) #0
 declare double @llvm.fabs.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
index 4b33549537af..706537d7e21c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.frexp.mant.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index a4b8d7fa58da..6720cbe9d8da 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 4825c3a479c1..303446b63315 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -1,10 +1,10 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI  %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-;GCN-LABEL: {{^}}mbcnt_intrinsics:
-;GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
-;SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
-;VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
+; GCN-LABEL: {{^}}mbcnt_intrinsics:
+; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
+; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
+; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
 
 define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
 main_body:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index a85fc7e13fd8..35fdba8f34a3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
-; RUN: llc -O0 -march=amdgcn -mcpu=tonga -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-OPT %s
+; RUN: llc -O0 -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefix=VI -check-prefix=VI-NOOPT %s
 
 ; FIXME: The register allocator / scheduler should be able to avoid these hazards.
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index 3fda4a59df29..f0b8e2a0293f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.rcp.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index a9d52b006df0..436ffff692c6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
 
 declare i32 @llvm.amdgcn.readlane(i32, i32) #0
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 7b1373b13f3a..5f40e0d0986f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
 
 declare float @llvm.amdgcn.rsq.clamp.f32(float) #1
 declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 74415e3658a4..2022d0289862 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.rsq.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index 012f6cd82925..c644288977a3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare float @llvm.amdgcn.rsq.f32(float) #0
 declare double @llvm.amdgcn.rsq.f64(double) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
index 372cba6eb67b..d8eda10fdfd8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -6,10 +6,10 @@ declare i64 @llvm.amdgcn.s.memrealtime() #0
 ; GCN-DAG: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-DAG: s_load_dwordx2
 ; GCN: lgkmcnt
-; GCN: buffer_store_dwordx2
+; GCN: _store_dwordx2
 ; GCN-NOT: lgkmcnt
 ; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
-; GCN: buffer_store_dwordx2
+; GCN: _store_dwordx2
 define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
index 8ce2d48733c6..ff9d74619788 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare i64 @llvm.amdgcn.s.memtime() #0
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
index d73986ac5753..6083ec885a86 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}test1:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index a348af259435..d453d03cded8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
 declare i32 @llvm.AMDGPU.flbit.i32(i32) #1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
index 0ebe012fe1c5..fac0e352614c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.amdgcn.sin.f16(half %a)
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
index 9dc4554b88a4..e3692fc5906c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
 
 declare float @llvm.amdgcn.sin.f32(float) #0
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index 7757e411553b..caac6ddbeb80 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 8fc12890fad0..1f18173f40a4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=CI-HSA  %s
-; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -mcpu=carrizo -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=CO-V2 -check-prefix=VI-HSA  %s
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=SI-MESA %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=MESA -check-prefix=VI-MESA %s
 ; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,SI-MESA %s
-; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=ALL,CO-V2,VI-MESA %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare i32 @llvm.amdgcn.workitem.id.y() #0
diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index dc984e91eec7..112e29ed22a7 100644
--- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.ceil.f16(half %a)
 declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index bb3a5a4dea7b..ba354ed0b124 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.cos.f16(half %a)
 declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
index 9165698cb815..7fa56911efdc 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.exp2.f16(half %a)
 declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index a7bdb1e9428a..60dfd734ee73 100644
--- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.floor.f16(half %a)
 declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index e6f85ef08f48..3431267e3943 100644
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.fma.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index af7bd2703948..3bc85bdc29ef 100644
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-FLUSH %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
 ; RUN: llc -march=amdgcn -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SI-DENORM %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
 
 declare half @llvm.fmuladd.f16(half %a, half %b, half %c)
 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
index 8c35412b1686..8d1a8973cb4e 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.log2.f16(half %a)
 declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 0f75f7a5a492..8adc01b7b8c7 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.maxnum.f16(half %a, half %b)
 declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
index ccb383e0d027..009338d273f5 100644
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* nocapture, i8 addrspace(3)* nocapture, i32, i32, i1) nounwind
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 6bf2e9ba2e32..4cc1deb2095c 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.minnum.f16(half %a, half %b)
 declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index 13ebee41e844..a5b07e072fa5 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SI-NOHSA -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI  -check-prefix=VI-NOHSA -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index ad9b66c9038c..3657940f36fd 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.rint.f16(half %a)
 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index 86002662e0b7..7e8f8ff172e8 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}round_f32:
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index 8374a75370b2..b01932f69b06 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.sin.f16(half %a)
 declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 8217ebf46738..69125b0bcfdc 100644
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.sqrt.f16(half %a)
 declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 04054c2b70fe..9f84b432209d 100644
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare half @llvm.trunc.f16(half %a)
 declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
diff --git a/test/CodeGen/AMDGPU/load-constant-f64.ll b/test/CodeGen/AMDGPU/load-constant-f64.ll
index f94a3785a685..1b42a9e96e01 100644
--- a/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}constant_load_f64:
 ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
diff --git a/test/CodeGen/AMDGPU/load-constant-i1.ll b/test/CodeGen/AMDGPU/load-constant-i1.ll
index f15e4f484ffa..104af10036c1 100644
--- a/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}constant_load_i1:
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
index eb79767e62be..f7be1291040f 100644
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}constant_load_i16:
diff --git a/test/CodeGen/AMDGPU/load-constant-i32.ll b/test/CodeGen/AMDGPU/load-constant-i32.ll
index 56b9e3a187c6..d1ff1c706c40 100644
--- a/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}constant_load_i32:
diff --git a/test/CodeGen/AMDGPU/load-constant-i64.ll b/test/CodeGen/AMDGPU/load-constant-i64.ll
index e4656a2b2ac6..0d071a10b49a 100644
--- a/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll
index 39a299fcd327..9fdc4ebfd854 100644
--- a/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
index 23f4a6079e81..805c0a7a39c7 100644
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
index a86cc5a6d3d4..dc1a9432283e 100644
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}global_load_f64:
 ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/test/CodeGen/AMDGPU/load-global-i1.ll b/test/CodeGen/AMDGPU/load-global-i1.ll
index ebfec781087e..e2e90cac8cc1 100644
--- a/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}global_load_i1:
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index 7bd131e6516c..88d6b7b99d30 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-SI,FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,GCN-NOHSA-VI,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=EGCM -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=EGCM -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
index 86f5d4eb4d5d..e3335347a63f 100644
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
index 305b954c78f9..dd4ce2c10ebd 100644
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-HSA -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NOHSA -check-prefix=FUNC %s
 
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index b183b6ccd622..c880700f347b 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,SI,FUNC %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-HSA,SI,FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-NOHSA,VI,FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/load-local-i32.ll b/test/CodeGen/AMDGPU/load-local-i32.ll
index 271e563024c0..280f9658ef8d 100644
--- a/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll
index 02b59e89c3f3..9ffc74213dd5 100644
--- a/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -142,7 +142,7 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
 ; GCN: s_mov_b32 m0
 ; GCN: ds_read_u16
 ; FIXME: Need to optimize this sequence to avoid extra shift on VI.
-;         t23: i16 = srl t39, Constant:i32<8> 
+;         t23: i16 = srl t39, Constant:i32<8>
 ;          t31: i32 = any_extend t23
 ;        t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
 
@@ -708,10 +708,11 @@ define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8>
 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
 
 ; EG: LDS_READ_RET
+; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
+; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-; EG-DAG: ASHR
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
@@ -740,14 +741,15 @@ define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8>
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
+; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
@@ -786,6 +788,11 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
+; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
@@ -798,10 +805,6 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
@@ -860,6 +863,11 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
+; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
+; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
@@ -884,14 +892,6 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
-; EG-DAG: ASHR
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
index b9f7018b8107..bc5e4945fb04 100644
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=CI-HSA -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=R600 -check-prefix=CM -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index 5cabd4dd81be..a7cee43187c1 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck --check-prefix=SI --check-prefix=BOTH %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=BOTH %s
 
 ; BOTH-LABEL: {{^}}local_i32_load
 ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
index ce82ff5475bc..6714a28aa43a 100644
--- a/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CIVI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_ret_i32:
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
index 34be6511a602..c88917812eda 100644
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
 ; GCN: ds_wrxchg_rtn_b64
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index 2ef045dbb8eb..dc43e8613ddf 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
 
 ; This used to fail due to a v_add_i32 instruction with an illegal immediate
 ; operand that was created during Local Stack Slot Allocation. Test case derived
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
index 256f6e87c84b..9b9d844cb0c6 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-offset.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=verde -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -mattr=-promote-alloca -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
 
 ; Allocate two stack slots of 2052 bytes each requiring a total of 4104 bytes.
 ; Extracting the last element of each does not fit into the offset field of
diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll
index 9ac988d38d1b..8468437c2c1f 100644
--- a/test/CodeGen/AMDGPU/lshl.ll
+++ b/test/CodeGen/AMDGPU/lshl.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
 
 ;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll
index 50e444ac26b3..c8ab7871434e 100644
--- a/test/CodeGen/AMDGPU/lshr.ll
+++ b/test/CodeGen/AMDGPU/lshr.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
 
 ;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
 
diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll
index f177608a62fc..f149ea0a6a0e 100644
--- a/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/test/CodeGen/AMDGPU/mad_int24.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
 
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
index 214534710653..9fde950f822c 100644
--- a/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -1,8 +1,8 @@
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ; RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=EG --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI --check-prefix=FUNC
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index c3d1a49c493a..6722aa79dd5d 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 ; FIXME: Enable VI
 
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 718508526bb1..27fbf58d26c6 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
  ; FIXME: None of these trigger madmk emission anymore. It is still
  ; possible, but requires the correct registers to be used which is
diff --git a/test/CodeGen/AMDGPU/max.i16.ll b/test/CodeGen/AMDGPU/max.i16.ll
index 0b0e026c5fa1..3f2a87f20691 100644
--- a/test/CodeGen/AMDGPU/max.i16.ll
+++ b/test/CodeGen/AMDGPU/max.i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/merge-store-usedef.ll b/test/CodeGen/AMDGPU/merge-store-usedef.ll
index e4a36d7e6914..1c82aeb9b7f5 100644
--- a/test/CodeGen/AMDGPU/merge-store-usedef.ll
+++ b/test/CodeGen/AMDGPU/merge-store-usedef.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: ds_write_b32
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 13d56535303f..19d0117d64a9 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
diff --git a/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll b/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
index b528577a7eaa..7d26ab88c701 100644
--- a/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
+++ b/test/CodeGen/AMDGPU/mubuf-shader-vgpr.ll
@@ -1,5 +1,4 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
 
 ; Test that buffer_load_format with VGPR resource descriptor is properly
 ; legalized.
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index c2fae4cfd2b2..7910b70d8cf2 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s -check-prefix=FUNC
 
 ; mul24 and mad24 are affected
diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll
index 0beb9524ed79..6f7dfe2e13eb 100644
--- a/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/test/CodeGen/AMDGPU/mul_int24.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 76869337730a..004d36f00e51 100644
--- a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll
index 20420a84de6f..127f3da220e7 100644
--- a/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=VI -check-prefix=GCN %s
 
 ; Make sure there isn't an extra space between the instruction name and first operands.
 
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index 3e254850a934..eca6909d4eb9 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll
index 58d411772f14..b7cc6d47cd87 100644
--- a/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll
index e6d0efd0ff94..7965b061fe5b 100644
--- a/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -8,11 +8,11 @@ declare i64 @llvm.readcyclecounter() #0
 ; VI-DAG: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-DAG: s_load_dwordx2
 ; GCN: lgkmcnt
-; GCN: buffer_store_dwordx2
+; GCN: store_dwordx2
 ; GCN-NOT: lgkmcnt
 ; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}}
 ; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
-; GCN: buffer_store_dwordx2
+; GCN: store_dwordx2
 define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.readcyclecounter()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index c902cb9e1dfb..dd67dc488dbf 100644
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}reduce_i64_load_align_4_width_to_i32:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
index 6e95f4c7521f..909644850750 100644
--- a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
+++ b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -o /dev/null < %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o /dev/null < %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -o /dev/null < %s
 
 ; The register coalescer introduces a verifier error which later
 ; results in a crash during scheduling.
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index ad8d00c36393..412202fa5d51 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}no_reorder_v2f64_global_load_store:
 ; SI: buffer_load_dwordx4
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
index 3f4ceb7e0310..b60c470de97c 100644
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
 
 ; BOTH-LABEL: {{^}}s_rotl_i64:
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
index 586de44a566c..58a1efe08079 100644
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=BOTH %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=BOTH %s
 
 ; BOTH-LABEL: {{^}}s_rotr_i64:
diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll
index 044f9ffe6d63..55d180077cc7 100644
--- a/test/CodeGen/AMDGPU/rotr.ll
+++ b/test/CodeGen/AMDGPU/rotr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=R600 -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}rotr_i32:
diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll
index bd6596a14920..f776faca8397 100644
--- a/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_addk_i32_k0:
 ; SI: s_load_dword [[VAL:s[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
index e422270fc4ac..0164c45083a2 100644
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_movk_i32_k0:
 ; SI-DAG: s_mov_b32 [[LO_S_IMM:s[0-9]+]], 0xffff{{$}}
diff --git a/test/CodeGen/AMDGPU/s_mulk_i32.ll b/test/CodeGen/AMDGPU/s_mulk_i32.ll
index a1462193ebb3..e83b368cc1cb 100644
--- a/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
 
 ; SI-LABEL: {{^}}s_mulk_i32_k0:
 ; SI: s_load_dword [[VAL:s[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 55b392a32729..32df16778a91 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; XXX - Why the packing?
 ; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index 886d4a1dcb5c..ccfde7b9adc5 100644
--- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
 
 ; FUNC-LABEL: {{^}}cluster_arg_loads:
 ; FIXME: Due to changes in the load clustering heuristics.  We no longer
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index 66caad4677b6..bafd6a50ccfe 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; The code generated by sdiv is long and complex and may frequently change.
diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll
index ccabd3c2a969..349a7821da17 100644
--- a/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}sdiv24_i8:
diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll
index a7ce948acd4f..28fdb69e1ada 100644
--- a/test/CodeGen/AMDGPU/sdivrem64.ll
+++ b/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -1,5 +1,5 @@
 ;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
-;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ;FUNC-LABEL: {{^}}s_test_sdiv:
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index d9d311cd032b..73dadde884ae 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}add_select_fabs_fabs_f32:
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll
index adc054b4895b..07dcb2153384 100644
--- a/test/CodeGen/AMDGPU/select-i1.ll
+++ b/test/CodeGen/AMDGPU/select-i1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FIXME: This should go in existing select.ll test, except the current testcase there is broken on SI
 
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 240235138c8d..759abe2f2e9a 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; Test expansion of scalar selects on vectors.
 ; Evergreen not enabled since it seems to be having problems with doubles.
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index 84084c912543..19fe8d9b2326 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}select_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll
index 25670afec247..0f46d4c7ea06 100644
--- a/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll
index e5af10e95228..4ab6da085634 100644
--- a/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_0:
diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll
index 5c6804e071df..1f86277e0bc6 100644
--- a/test/CodeGen/AMDGPU/setcc64.ll
+++ b/test/CodeGen/AMDGPU/setcc64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s| FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; XXX: Merge this into setcc, once R600 supports 64-bit operations
 
diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll
index 9b5d6b5dbd62..01e4a7fda5d2 100644
--- a/test/CodeGen/AMDGPU/seto.ll
+++ b/test/CodeGen/AMDGPU/seto.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 84f9b7bb8064..4c58261709c4 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FIXME: i16 promotion pass ruins the scalar cases when legal.
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index df67fcca22fe..f44ae6e09e9f 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 ; Copy VGPR -> SGPR used twice as an instruction operand, which is then
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index 31ce4a681992..013f5253b369 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
 
 ; This test checks that no VGPR to SGPR copies are created by the register
 ; allocator.
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 486465cb7172..972349c24453 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; XUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll
index a6be2eda33b3..6e45759fa058 100644
--- a/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=+load-store-opt -enable-misched < %s | FileCheck -check-prefix=SI %s
 
 ; Test that doing a shift of a pointer with a constant add will be
 ; folded into the constant offset addressing mode even if the add has
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 175fa0e9f950..d658b229fd37 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:
 
diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll
index bca34af3e270..8e846d7a238e 100644
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
 
 ; This shader has the potential to generated illegal VGPR to SGPR copies if
 ; the wrong register class is used for the REG_SEQUENCE instructions.
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index f035da83d18c..e61b4051124a 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll
index 30aa2d550f65..06f9277080a8 100644
--- a/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s -verify-machineinstrs | FileCheck -check-prefix=SI %s
 
 ; If this occurs it is likely due to reordering and the restore was
diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index e9a3a7f3fcff..062f5245af10 100644
--- a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s
-; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SGPR %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=SMEM %s
 
 ; Make sure this doesn't crash.
 ; ALL-LABEL: {{^}}test:
diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll
index c7d85a0340cc..dd8783df5c3c 100644
--- a/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
 
 ; CHECK: {{^}}test_8_min_char:
 ; CHECK: buffer_store_byte
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 05938170eba1..875351c59961 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
 
 ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
 ; GCN: v_cndmask_b32_e64
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index b0ae1c2969f9..5df8105116cc 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 
 ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
 
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
index bae9bead46ab..4c8fea12bada 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i32_to_f32:
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll
index 0910d7718440..1395fa2bfea0 100644
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}sitofp_i16_to_f16
 ; GCN: buffer_load_{{sshort|ushort}} v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll
index 9b977fc54630..985c73904f43 100644
--- a/test/CodeGen/AMDGPU/smed3.ll
+++ b/test/CodeGen/AMDGPU/smed3.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index 83465206e066..ce5d92451647 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -181,8 +181,8 @@ define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
 }
 
 ; FUNC-LABEL: {{^}}v_min_max_i32:
-; GCN: buffer_load_dword [[VAL0:v[0-9]+]]
-; GCN: buffer_load_dword [[VAL1:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL0:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[VAL1:v[0-9]+]]
 
 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
 ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index f82ac041969c..daac5b92b1ef 100644
--- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VCCZ-BUG %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NOVCCZ-BUG %s
 
 ; GCN-FUNC: {{^}}vccz_workaround:
 ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0x0
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 476da9486dff..9b118425f9cb 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
+; RUN: llc < %s -march=amdgcn -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
 ; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN  %s
 ; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
 
diff --git a/test/CodeGen/AMDGPU/sopk-compares.ll b/test/CodeGen/AMDGPU/sopk-compares.ll
index 6b708d9af5db..74acc5bc961c 100644
--- a/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; Since this intrinsic is exposed as a constant after isel, use it to
 ; defeat the DAG's compare with constant canonicalizations.
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index 710547426e3c..ad7c86fe7919 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index bbd954356322..6b006fd936d7 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
-; XUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll
index 0e68cf450578..5d49795a68ec 100644
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cayman < %s | FileCheck -check-prefix=CM -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
index 03ae01afbcd0..78db2d37724b 100644
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}global_store_v3i64:
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index 065bac3488d2..b5d5f56b2796 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index f7aa4bc2c6dc..a331475820a0 100644
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI %s
 
 ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
 ; CHECK: buffer_load_dword v
diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index b065fb266cec..7a4bced9d436 100644
--- a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll
index b1e879f26301..da2a5b43dad5 100644
--- a/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
 
 
 ; SI-LABEL: {{^}}global_truncstore_i32_to_i1:
diff --git a/test/CodeGen/AMDGPU/trunc-store.ll b/test/CodeGen/AMDGPU/trunc-store.ll
index cf5c00e65b7d..c6727e1e1273 100644
--- a/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/test/CodeGen/AMDGPU/trunc-store.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
 ; SI: buffer_store_dwordx4
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
index 11438f267ad0..35af7119a300 100644
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs< %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index 02383a972056..da88d2a8e8cb 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}udiv_i32:
diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll
index 078df5307334..17f4ebf175d9 100644
--- a/test/CodeGen/AMDGPU/udivrem.ll
+++ b/test/CodeGen/AMDGPU/udivrem.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_udivrem:
diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll
index 147b95560935..6d145f1dbf09 100644
--- a/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/test/CodeGen/AMDGPU/udivrem24.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}udiv24_i8:
diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll
index 72e6af9a6eea..da61a841ff35 100644
--- a/test/CodeGen/AMDGPU/udivrem64.ll
+++ b/test/CodeGen/AMDGPU/udivrem64.ll
@@ -1,5 +1,5 @@
 ;RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=FUNC %s
-;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
+;RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=FUNC %s
 ;RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ;FUNC-LABEL: {{^}}test_udiv:
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index 22d563d40b74..cd816b27fce6 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 
 ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
 
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
index aae055083663..3003226ca1a4 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i32_to_f32:
diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll
index 73d9e8c9c222..faab5ca5db73 100644
--- a/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}uitofp_i16_to_f16
 ; GCN: buffer_load_ushort v[[A_I16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll
index a26eb8f9ada9..a2e485d36225 100644
--- a/test/CodeGen/AMDGPU/umed3.ll
+++ b/test/CodeGen/AMDGPU/umed3.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 453c981db1a5..0f76a54975e6 100644
--- a/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=+unaligned-buffer-access -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=UNALIGNED %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs< %s | FileCheck -check-prefix=SI -check-prefix=ALIGNED %s
 
 ; SI-LABEL: {{^}}local_unaligned_load_store_i16:
 ; SI: ds_read_u8
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
index a0060bd368be..154ac361e797 100644
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -machine-sink-split-probability-threshold=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}uniform_if_scc:
 ; GCN-DAG: s_cmp_eq_u32 s{{[0-9]+}}, 0
diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll
index daacc771708a..d58d2dc2d963 100644
--- a/test/CodeGen/AMDGPU/urecip.ll
+++ b/test/CodeGen/AMDGPU/urecip.ll
@@ -1,7 +1,7 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
-;CHECK: v_rcp_iflag_f32_e32
+; CHECK: v_rcp_iflag_f32_e32
 
 define void @test(i32 %p, i32 %q) {
    %i = udiv i32 %p, %q
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
index 62841ec2d6c5..9e2cfa34e0b9 100644
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; The code generated by urem is long and complex and may frequently
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 87fcfbade14a..82bdc261b112 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 declare float @llvm.fma.f32(float, float, float) #1
 declare double @llvm.fma.f64(double, double, double) #1
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index 1c61671f9cd3..1cd49feb0d88 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 16aed5928b0a..9a2dc743d6c9 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}mac_vvv:
 ; GCN: buffer_load_dword [[A:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0{{$}}
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll
index ecd5b01545d6..151f2cc9fc73 100644
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -1,10 +1,10 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}mac_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
-; GCN: buffer_load_ushort v[[C_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[A_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[B_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_ushort v[[C_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[C_F32:[0-9]+]], v[[C_F16]]
@@ -283,9 +283,9 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}mac_v2f16
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
@@ -306,7 +306,7 @@ entry:
 ; VI:  v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
 ; VI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
 ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
-; GCN: buffer_store_dword v[[R_V2_F16]]
+; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
 define void @mac_v2f16(
     <2 x half> addrspace(1)* %r,
diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll
index fd5ad3e3d605..df220d7a977b 100644
--- a/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}madak_f16
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index df7cf09e9933..a8908f87fbf6 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
 ; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unsued register.
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index 0cd706b642d7..fe5be7526b19 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -1,5 +1,5 @@
 ;RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
+;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=VI --check-prefix=FUNC %s
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}test_select_v2i32:
diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll
index 152a474cde7e..621c582fcefd 100644
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@@ -1,7 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
-; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=SI -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
-; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=DEFAULT
+; RUN: llc -march=amdgcn --misched=ilpmax -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
+; RUN: llc -march=amdgcn --misched=ilpmax -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace %s --check-prefix=ILPMAX
 ; The ilpmax scheduler is used for the second test to get the ordering we want for the test.
 
 ; DEFAULT-LABEL: {{^}}main:
diff --git a/test/CodeGen/AMDGPU/waitcnt-flat.ll b/test/CodeGen/AMDGPU/waitcnt-flat.ll
index 38dbf2794fc5..d29bae45d8c2 100644
--- a/test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ b/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck --check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji | FileCheck --check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global | FileCheck --check-prefix=GCN %s
 
 ; If flat_store_dword and flat_load_dword use different registers for the data
 ; operand, this test is not broken.  It just means it is no longer testing
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index 53f4c0a91742..bf02d4c3b311 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 
diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll
index b30cb73f6dac..572099617605 100644
--- a/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/test/CodeGen/AMDGPU/zero_extend.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefix=SI
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=SI
+; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=SI
 ; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=R600
 
 ; R600: {{^}}s_mad_zext_i32_to_i64:
diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll
index e185c2a8afbc..23b626e0ce51 100644
--- a/test/CodeGen/ARM/neon_div.ll
+++ b/test/CodeGen/ARM/neon_div.ll
@@ -1,49 +1,58 @@
-; RUN: llc -mtriple=arm-eabi -mattr=+neon -pre-RA-sched=source -disable-post-ra %s -o - \
-; RUN:  | FileCheck %s
+; RUN: llc -mtriple arm-eabi -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7-windows-itanium -mattr=+neon -disable-post-ra -pre-RA-sched source %s -o - | FileCheck %s
 
 define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vmovn.i32
-;CHECK: vrecpe.f32
-;CHECK: vmovn.i32
-;CHECK: vmovn.i16
-	%tmp1 = load <8 x i8>, <8 x i8>* %A
-	%tmp2 = load <8 x i8>, <8 x i8>* %B
-	%tmp3 = sdiv <8 x i8> %tmp1, %tmp2
-	ret <8 x i8> %tmp3
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = sdiv <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
 }
 
+; CHECK-LABEL: sdivi8:
+; CHECK: vrecpe.f32
+; CHECK: vmovn.i32
+; CHECK: vrecpe.f32
+; CHECK: vmovn.i32
+; CHECK: vmovn.i16
+
 define <8 x i8> @udivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-;CHECK: vqmovun.s16
-	%tmp1 = load <8 x i8>, <8 x i8>* %A
-	%tmp2 = load <8 x i8>, <8 x i8>* %B
-	%tmp3 = udiv <8 x i8> %tmp1, %tmp2
-	ret <8 x i8> %tmp3
+  %tmp1 = load <8 x i8>, <8 x i8>* %A
+  %tmp2 = load <8 x i8>, <8 x i8>* %B
+  %tmp3 = udiv <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
 }
 
+; CHECK-LABEL: udivi8:
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+; CHECK: vqmovun.s16
+
 define <4 x i16> @sdivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-	%tmp1 = load <4 x i16>, <4 x i16>* %A
-	%tmp2 = load <4 x i16>, <4 x i16>* %B
-	%tmp3 = sdiv <4 x i16> %tmp1, %tmp2
-	ret <4 x i16> %tmp3
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = sdiv <4 x i16> %tmp1, %tmp2
+  ret <4 x i16> %tmp3
 }
 
+; CHECK-LABEL: sdivi16:
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+
 define <4 x i16> @udivi16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
-;CHECK: vrecpe.f32
-;CHECK: vrecps.f32
-;CHECK: vrecps.f32
-;CHECK: vmovn.i32
-	%tmp1 = load <4 x i16>, <4 x i16>* %A
-	%tmp2 = load <4 x i16>, <4 x i16>* %B
-	%tmp3 = udiv <4 x i16> %tmp1, %tmp2
-	ret <4 x i16> %tmp3
+  %tmp1 = load <4 x i16>, <4 x i16>* %A
+  %tmp2 = load <4 x i16>, <4 x i16>* %B
+  %tmp3 = udiv <4 x i16> %tmp1, %tmp2
+  ret <4 x i16> %tmp3
 }
+
+; CHECK-LABEL: udivi16:
+; CHECK: vrecpe.f32
+; CHECK: vrecps.f32
+; CHECK: vrecps.f32
+; CHECK: vmovn.i32
+
diff --git a/test/CodeGen/ARM/vector-load.ll b/test/CodeGen/ARM/vector-load.ll
index a638c2bdb9be..ed734723a86d 100644
--- a/test/CodeGen/ARM/vector-load.ll
+++ b/test/CodeGen/ARM/vector-load.ll
@@ -251,3 +251,13 @@ define <4 x i32> @zextload_v8i8tov8i32_fake_update(<4 x i8>** %ptr) {
         %zlA = zext <4 x i8> %lA to <4 x i32>
 	ret <4 x i32> %zlA
 }
+
+; CHECK-LABEL: test_silly_load:
+; CHECK: ldr {{r[0-9]+}}, [r0, #24]
+; CHECK: vld1.8 {d{{[0-9]+}}, d{{[0-9]+}}}, [r0:128]!
+; CHECK: vldr d{{[0-9]+}}, [r0]
+
+define void @test_silly_load(<28 x i8>* %addr) {
+  load volatile <28 x i8>, <28 x i8>* %addr
+  ret void
+}
diff --git a/test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll b/test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll
index 8a8c667b2d8d..93c3cb14fb73 100644
--- a/test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll
+++ b/test/CodeGen/ARM/xray-armv6-attribute-instrumentation.ll
@@ -23,3 +23,9 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 ; CHECK-LABEL: Ltmp1:
 ; CHECK-NEXT:  bx	lr
 }
+; CHECK:       .p2align 4
+; CHECK-NEXT:  .long {{.*}}Lxray_synthetic_0
+; CHECK-NEXT:  .section {{.*}}xray_instr_map{{.*}}
+; CHECK-LABEL: Lxray_synthetic_0:
+; CHECK:       .long {{.*}}Lxray_sled_0
+; CHECK:       .long {{.*}}Lxray_sled_1
diff --git a/test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll b/test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll
index 07852cd68796..d14590b88679 100644
--- a/test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll
+++ b/test/CodeGen/ARM/xray-armv7-attribute-instrumentation.ll
@@ -23,3 +23,9 @@ define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always"
 ; CHECK-LABEL: Ltmp1:
 ; CHECK-NEXT:  bx lr
 }
+; CHECK:       .p2align 4
+; CHECK-NEXT:  .long {{.*}}Lxray_synthetic_0
+; CHECK-NEXT:  .section {{.*}}xray_instr_map{{.*}}
+; CHECK-LABEL: Lxray_synthetic_0:
+; CHECK:       .long {{.*}}Lxray_sled_0
+; CHECK:       .long {{.*}}Lxray_sled_1
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index c729b988cfb8..789ca2413940 100755
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -39,29 +39,3 @@ define <16 x i8> @trunc_16_8(<16 x i16> %A) nounwind uwtable readnone ssp{
   %B = trunc <16 x i16> %A to <16 x i8>
   ret <16 x i8> %B
 }
-
-define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
-; CHECK-LABEL: usat_trunc_wb_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  ret <16 x i8> %x6
-}
-
-define <8 x i16> @usat_trunc_dw_256(<8 x i32> %i) {
-; CHECK-LABEL: usat_trunc_dw_256:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vzeroupper
-; CHECK-NEXT:    retq
-  %x3 = icmp ult <8 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x5 = select <8 x i1> %x3, <8 x i32> %i, <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x6 = trunc <8 x i32> %x5 to <8 x i16>
-  ret <8 x i16> %x6
-}
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index fb6c55b26e7c..646697b82c2d 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -500,208 +500,3 @@ define void @trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) #0 {
     store <8 x i8> %x, <8 x i8>* %res
     ret void
 }
-
-
-define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
-; KNL-LABEL: usat_trunc_wb_256_mem:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vmovdqu %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_wb_256_mem:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
-; SKX-NEXT:    retq
-  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
-; KNL-LABEL: usat_trunc_wb_256:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_wb_256:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
-; SKX-NEXT:    retq
-  %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <16 x i16> %x5 to <16 x i8>
-  ret <16 x i8> %x6
-}
-
-define void @usat_trunc_wb_128_mem(<8 x i16> %i, <8 x i8>* %res) {
-; KNL-LABEL: usat_trunc_wb_128_mem:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; KNL-NEXT:    vmovq %xmm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_wb_128_mem:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpmovuswb %xmm0, (%rdi)
-; SKX-NEXT:    retq
-  %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <8 x i16> %x5 to <8 x i8>
-  store <8 x i8> %x6, <8 x i8>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
-; ALL-LABEL: usat_trunc_db_512_mem:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
-; ALL-NEXT:    retq
-  %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <16 x i32> %x5 to <16 x i8>
-  store <16 x i8> %x6, <16 x i8>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
-; ALL-LABEL: usat_trunc_qb_512_mem:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
-  %x6 = trunc <8 x i64> %x5 to <8 x i8>
-  store <8 x i8> %x6, <8 x i8>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
-; ALL-LABEL: usat_trunc_qd_512_mem:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
-  %x6 = trunc <8 x i64> %x5 to <8 x i32>
-  store <8 x i32> %x6, <8 x i32>* %res, align 1
-  ret void
-}
-
-define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
-; ALL-LABEL: usat_trunc_qw_512_mem:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x6 = trunc <8 x i64> %x5 to <8 x i16>
-  store <8 x i16> %x6, <8 x i16>* %res, align 1
-  ret void
-}
-
-define <32 x i8> @usat_trunc_db_1024(<32 x i32> %i) {
-; KNL-LABEL: usat_trunc_db_1024:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_db_1024:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm2
-; SKX-NEXT:    vpminud %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminud %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    vpmovdw %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovwb %zmm0, %ymm0
-; SKX-NEXT:    retq
-  %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <32 x i32> %x5 to <32 x i8>
-  ret <32 x i8> %x6
-}
-
-define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
-; KNL-LABEL: usat_trunc_db_1024_mem:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpmovusdb %zmm0, %xmm0
-; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; KNL-NEXT:    vmovdqu %ymm0, (%rdi)
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_db_1024_mem:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm2
-; SKX-NEXT:    vpminud %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminud %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    vpmovdw %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
-; SKX-NEXT:    retq
-  %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
-  %x6 = trunc <32 x i32> %x5 to <32 x i8>
-  store <32 x i8>%x6, <32 x i8>* %p, align 1
-  ret void
-}
-
-define <16 x i16> @usat_trunc_dw_512(<16 x i32> %i) {
-; ALL-LABEL: usat_trunc_dw_512:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpmovusdw %zmm0, %ymm0
-; ALL-NEXT:    retq
-  %x3 = icmp ult <16 x i32> %i, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
-  %x6 = trunc <16 x i32> %x5 to <16 x i16>
-  ret <16 x i16> %x6
-}
-
-define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
-; ALL-LABEL: usat_trunc_wb_128:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpminuw {{.*}}(%rip), %xmm0, %xmm0
-; ALL-NEXT:    retq
-  %x3 = icmp ult <8 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x5 = select <8 x i1> %x3, <8 x i16> %i, <8 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
-  %x6 = trunc <8 x i16> %x5 to <8 x i8>
-  ret <8 x i8>%x6
-}
-
-define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
-; KNL-LABEL: usat_trunc_qw_1024:
-; KNL:       ## BB#0:
-; KNL-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm2
-; KNL-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
-; KNL-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
-; KNL-NEXT:    vpmovqd %zmm0, %ymm0
-; KNL-NEXT:    vpmovqd %zmm1, %ymm1
-; KNL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: usat_trunc_qw_1024:
-; SKX:       ## BB#0:
-; SKX-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm2
-; SKX-NEXT:    vpminuq %zmm2, %zmm1, %zmm1
-; SKX-NEXT:    vpminuq %zmm2, %zmm0, %zmm0
-; SKX-NEXT:    vpmovqd %zmm0, %ymm0
-; SKX-NEXT:    vpmovqd %zmm1, %ymm1
-; SKX-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
-; SKX-NEXT:    vpmovdw %zmm0, %ymm0
-; SKX-NEXT:    retq
-  %x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
-  %x6 = trunc <16 x i64> %x5 to <16 x i16>
-  ret <16 x i16> %x6
-}
-
diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll
index 44ad05ec6ed7..08015258867b 100644
--- a/test/CodeGen/X86/phaddsub.ll
+++ b/test/CodeGen/X86/phaddsub.ll
@@ -225,3 +225,61 @@ define <4 x i32> @phsubd4(<4 x i32> %x) {
   %r = sub <4 x i32> %a, %b
   ret <4 x i32> %r
 }
+
+define <8 x i16> @phsubw1_reverse(<8 x i16> %x, <8 x i16> %y) {
+; SSSE3-LABEL: phsubw1_reverse:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    pshufb %xmm3, %xmm4
+; SSSE3-NEXT:    movdqa %xmm0, %xmm2
+; SSSE3-NEXT:    pshufb %xmm3, %xmm2
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSSE3-NEXT:    pshufb %xmm3, %xmm1
+; SSSE3-NEXT:    pshufb %xmm3, %xmm0
+; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-NEXT:    psubw %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: phsubw1_reverse:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm3
+; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; AVX-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vpsubw %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %r = sub <8 x i16> %a, %b
+  ret <8 x i16> %r
+}
+
+define <4 x i32> @phsubd1_reverse(<4 x i32> %x, <4 x i32> %y) {
+; SSSE3-LABEL: phsubd1_reverse:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    movaps %xmm0, %xmm2
+; SSSE3-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
+; SSSE3-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-NEXT:    psubd %xmm0, %xmm2
+; SSSE3-NEXT:    movdqa %xmm2, %xmm0
+; SSSE3-NEXT:    retq
+;
+; AVX-LABEL: phsubd1_reverse:
+; AVX:       # BB#0:
+; AVX-NEXT:    vshufps {{.*#+}} xmm2 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    retq
+  %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %r = sub <4 x i32> %a, %b
+  ret <4 x i32> %r
+}
+
diff --git a/test/MC/AMDGPU/vop3.s b/test/MC/AMDGPU/vop3.s
index 6a62df0d9561..21fbc644bb5c 100644
--- a/test/MC/AMDGPU/vop3.s
+++ b/test/MC/AMDGPU/vop3.s
@@ -352,10 +352,6 @@ v_div_scale_f32  v24, vcc, v22, v22, v20
 // SICI: v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0x52,0x04]
 // VI:   v_div_scale_f32 v24, vcc, v22, v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0x52,0x04]
 
-v_div_scale_f32  v24, vcc, s[10:11], v22, v20
-// SICI: v_div_scale_f32 v24, vcc, s[10:11], v22, v20 ; encoding: [0x18,0x6a,0xda,0xd2,0x0a,0x2c,0x52,0x04]
-// VI:   v_div_scale_f32 v24, vcc, s[10:11], v22, v20 ; encoding: [0x18,0x6a,0xe0,0xd1,0x0a,0x2c,0x52,0x04]
-
 v_div_scale_f32  v24, s[10:11], v22, v22, v20
 // SICI: v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xda,0xd2,0x16,0x2d,0x52,0x04]
 // VI:   v_div_scale_f32 v24, s[10:11], v22, v22, v20 ; encoding: [0x18,0x0a,0xe0,0xd1,0x16,0x2d,0x52,0x04]
@@ -365,8 +361,8 @@ v_div_scale_f32  v24, vcc, v22, 1.0, v22
 // VI:   v_div_scale_f32 v24, vcc, v22, 1.0, v22 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0xe5,0x59,0x04]
 
 v_div_scale_f32  v24, vcc, v22, v22, -2.0
-// SICI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0xd2,0x83]
-// VI:   v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0xd2,0x83]
+// SICI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0xd6,0x03]
+// VI:   v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xe0,0xd1,0x16,0x2d,0xd6,0x03]
 
 v_div_scale_f32 v24, vcc, v22, v22, 0xc0000000
 // SICI: v_div_scale_f32 v24, vcc, v22, v22, -2.0 ; encoding: [0x18,0x6a,0xda,0xd2,0x16,0x2d,0xd6,0x03]
diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll
index 953d96edadaf..0f7bfa8516c9 100644
--- a/test/Transforms/SimplifyCFG/sink-common-code.ll
+++ b/test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -768,6 +768,30 @@ if.end:
 ; CHECK-NOT: exact
 ; CHECK: }
 
+; Check that simplifycfg doesn't sink and merge inline-asm instructions.
+
+define i32 @test_inline_asm1(i32 %c, i32 %r6) {
+entry:
+  %tobool = icmp eq i32 %c, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %0 = call i32 asm "rorl $2, $0", "=&r,0,n,~{dirflag},~{fpsr},~{flags}"(i32 %r6, i32 8)
+  br label %if.end
+
+if.else:
+  %1 = call i32 asm "rorl $2, $0", "=&r,0,n,~{dirflag},~{fpsr},~{flags}"(i32 %r6, i32 6)
+  br label %if.end
+
+if.end:
+  %r6.addr.0 = phi i32 [ %0, %if.then ], [ %1, %if.else ]
+  ret i32 %r6.addr.0
+}
+
+; CHECK-LABEL: @test_inline_asm1(
+; CHECK: call i32 asm "rorl $2, $0", "=&r,0,n,~{dirflag},~{fpsr},~{flags}"(i32 %r6, i32 8)
+; CHECK: call i32 asm "rorl $2, $0", "=&r,0,n,~{dirflag},~{fpsr},~{flags}"(i32 %r6, i32 6)
+
 declare i32 @call_target()
 
 define void @test_operand_bundles(i1 %cond, i32* %ptr) {
diff --git a/utils/sanitizers/ubsan_blacklist.txt b/utils/sanitizers/ubsan_blacklist.txt
new file mode 100644
index 000000000000..49975866ea41
--- /dev/null
+++ b/utils/sanitizers/ubsan_blacklist.txt
@@ -0,0 +1,7 @@
+# This blacklist should be applied when LLVM is built
+# with -fsanitize=undefined instrumentation. It exists
+# because libstdc++ has some undefined behavior issues
+# in some of the headers, in particular, stl_tree.h.
+
+# upcast of address with insufficient space for an object of type std::_Rb_tree_node<...>
+src:*bits/stl_tree.h

From d1b6c770bea9b3da66c6c1eb9a7c483ed7e55c1e Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Sun, 29 Jan 2017 20:58:47 +0000
Subject: [PATCH 2/5] Vendor import of clang release_40 branch r293443:
 https://llvm.org/svn/llvm-project/cfe/branches/release_40@293443

---
 lib/AST/ASTContext.cpp                 |  3 ++-
 lib/CodeGen/CodeGenFunction.cpp        |  3 +--
 lib/Index/IndexDecl.cpp                |  8 +++++++-
 lib/Sema/SemaInit.cpp                  |  2 +-
 test/Index/Core/index-source.m         | 12 ++++++------
 test/Index/Core/index-subkinds.m       | 12 ++++++------
 test/Index/index-decls.m               |  4 ++--
 test/Index/index-module.m              |  2 +-
 test/OpenMP/openmp_seh.c               | 18 ++++++++++++++++++
 test/SemaCXX/constant-expression.cpp   | 11 +++++++++++
 test/SemaCXX/new-delete-cxx0x.cpp      |  4 +++-
 tools/libclang/CXIndexDataConsumer.cpp |  7 ++++---
 tools/libclang/CXIndexDataConsumer.h   |  2 +-
 13 files changed, 63 insertions(+), 25 deletions(-)
 create mode 100644 test/OpenMP/openmp_seh.c

diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index d03c22af5b29..b531a66dbab3 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -9025,7 +9025,8 @@ bool ASTContext::DeclMustBeEmitted(const Decl *D) {
 
   // Variables that have initialization with side-effects are required.
   if (VD->getInit() && VD->getInit()->HasSideEffects(*this) &&
-      !VD->evaluateValue())
+      // We can get a value-dependent initializer during error recovery.
+      (VD->getInit()->isValueDependent() || !VD->evaluateValue()))
     return true;
 
   // Likewise, variables with tuple-like bindings are required if their
diff --git a/lib/CodeGen/CodeGenFunction.cpp b/lib/CodeGen/CodeGenFunction.cpp
index 137c69420ddf..e142a21b1e74 100644
--- a/lib/CodeGen/CodeGenFunction.cpp
+++ b/lib/CodeGen/CodeGenFunction.cpp
@@ -112,9 +112,8 @@ CodeGenFunction::~CodeGenFunction() {
   if (FirstBlockInfo)
     destroyBlockInfos(FirstBlockInfo);
 
-  if (getLangOpts().OpenMP) {
+  if (getLangOpts().OpenMP && CurFn)
     CGM.getOpenMPRuntime().functionFinished(*this);
-  }
 }
 
 CharUnits CodeGenFunction::getNaturalPointeeTypeAlignment(QualType T,
diff --git a/lib/Index/IndexDecl.cpp b/lib/Index/IndexDecl.cpp
index 7d60aad3895d..3b4f3f81399d 100644
--- a/lib/Index/IndexDecl.cpp
+++ b/lib/Index/IndexDecl.cpp
@@ -92,7 +92,13 @@ class IndexingDeclVisitor : public ConstDeclVisitor<IndexingDeclVisitor, bool> {
       Relations.emplace_back((unsigned)SymbolRole::RelationAccessorOf,
                              AssociatedProp);
 
-    if (!IndexCtx.handleDecl(D, (unsigned)SymbolRole::Dynamic, Relations))
+    // getLocation() returns beginning token of a method declaration, but for
+    // indexing purposes we want to point to the base name.
+    SourceLocation MethodLoc = D->getSelectorStartLoc();
+    if (MethodLoc.isInvalid())
+      MethodLoc = D->getLocation();
+
+    if (!IndexCtx.handleDecl(D, MethodLoc, (unsigned)SymbolRole::Dynamic, Relations))
       return false;
     IndexCtx.indexTypeSourceInfo(D->getReturnTypeSourceInfo(), D);
     bool hasIBActionAndFirst = D->hasAttr<IBActionAttr>();
diff --git a/lib/Sema/SemaInit.cpp b/lib/Sema/SemaInit.cpp
index 45eff5ee6b62..b053c83c3f69 100644
--- a/lib/Sema/SemaInit.cpp
+++ b/lib/Sema/SemaInit.cpp
@@ -1684,7 +1684,7 @@ void InitListChecker::CheckArrayType(const InitializedEntity &Entity,
     // If this is an incomplete array type, the actual type needs to
     // be calculated here.
     llvm::APSInt Zero(maxElements.getBitWidth(), maxElements.isUnsigned());
-    if (maxElements == Zero) {
+    if (maxElements == Zero && !Entity.isVariableLengthArrayNew()) {
       // Sizing an array implicitly to zero is not allowed by ISO C,
       // but is supported by GNU.
       SemaRef.Diag(IList->getLocStart(),
diff --git a/test/Index/Core/index-source.m b/test/Index/Core/index-source.m
index 3debcd262b24..880028df6310 100644
--- a/test/Index/Core/index-source.m
+++ b/test/Index/Core/index-source.m
@@ -3,10 +3,10 @@
 @interface Base
 // CHECK: [[@LINE-1]]:12 | class/ObjC | Base | c:objc(cs)Base | _OBJC_CLASS_$_Base | Decl | rel: 0
 -(void)meth;
-// CHECK: [[@LINE-1]]:1 | instance-method/ObjC | meth | c:objc(cs)Base(im)meth | -[Base meth] | Decl,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE-1]]:8 | instance-method/ObjC | meth | c:objc(cs)Base(im)meth | -[Base meth] | Decl,Dyn,RelChild | rel: 1
 // CHECK-NEXT: RelChild | Base | c:objc(cs)Base
 +(Base*)class_meth;
-// CHECK: [[@LINE-1]]:1 | class-method/ObjC | class_meth | c:objc(cs)Base(cm)class_meth | +[Base class_meth] | Decl,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE-1]]:9 | class-method/ObjC | class_meth | c:objc(cs)Base(cm)class_meth | +[Base class_meth] | Decl,Dyn,RelChild | rel: 1
 // CHECK: [[@LINE-2]]:3 | class/ObjC | Base | c:objc(cs)Base | _OBJC_CLASS_$_Base | Ref,RelCont | rel: 1
 // CHECK-NEXT: RelCont | class_meth | c:objc(cs)Base(cm)class_meth
 
@@ -92,7 +92,7 @@ extern int setjmp(jmp_buf);
 
 @class I1;
 @interface I1
-// CHECK: [[@LINE+1]]:1 | instance-method/ObjC | meth | c:objc(cs)I1(im)meth | -[I1 meth] | Decl,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:8 | instance-method/ObjC | meth | c:objc(cs)I1(im)meth | -[I1 meth] | Decl,Dyn,RelChild | rel: 1
 -(void)meth;
 @end
 
@@ -117,7 +117,7 @@ extern int setjmp(jmp_buf);
 // CHECK-NEXT: RelChild | I2 | c:objc(cs)I2
 @synthesize prop = _prop;
 
-// CHECK: [[@LINE+5]]:1 | instance-method(IB)/ObjC | doAction:foo: | c:objc(cs)I2(im)doAction:foo: | -[I2 doAction:foo:] | Def,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+5]]:12 | instance-method(IB)/ObjC | doAction:foo: | c:objc(cs)I2(im)doAction:foo: | -[I2 doAction:foo:] | Def,Dyn,RelChild | rel: 1
 // CHECK-NEXT: RelChild | I2 | c:objc(cs)I2
 // CHECK: [[@LINE+3]]:22 | class/ObjC | I1 | c:objc(cs)I1 | _OBJC_CLASS_$_I1 | Ref,RelCont,RelIBType | rel: 1
 // CHECK-NEXT: RelCont,RelIBType | doAction:foo: | c:objc(cs)I2(im)doAction:foo:
@@ -127,11 +127,11 @@ extern int setjmp(jmp_buf);
 
 @interface I3
 @property (readwrite) id prop;
-// CHECK: [[@LINE+3]]:1 | instance-method/acc-get/ObjC | prop | c:objc(cs)I3(im)prop | -[I3 prop] | Decl,Dyn,RelChild,RelAcc | rel: 2
+// CHECK: [[@LINE+3]]:6 | instance-method/acc-get/ObjC | prop | c:objc(cs)I3(im)prop | -[I3 prop] | Decl,Dyn,RelChild,RelAcc | rel: 2
 // CHECK-NEXT: RelChild | I3 | c:objc(cs)I3
 // CHECK-NEXT: RelAcc | prop | c:objc(cs)I3(py)prop
 -(id)prop;
-// CHECK: [[@LINE+3]]:1 | instance-method/acc-set/ObjC | setProp: | c:objc(cs)I3(im)setProp: | -[I3 setProp:] | Decl,Dyn,RelChild,RelAcc | rel: 2
+// CHECK: [[@LINE+3]]:8 | instance-method/acc-set/ObjC | setProp: | c:objc(cs)I3(im)setProp: | -[I3 setProp:] | Decl,Dyn,RelChild,RelAcc | rel: 2
 // CHECK-NEXT: RelChild | I3 | c:objc(cs)I3
 // CHECK-NEXT: RelAcc | prop | c:objc(cs)I3(py)prop
 -(void)setProp:(id)p;
diff --git a/test/Index/Core/index-subkinds.m b/test/Index/Core/index-subkinds.m
index 6783f6dde308..15d60b151896 100644
--- a/test/Index/Core/index-subkinds.m
+++ b/test/Index/Core/index-subkinds.m
@@ -9,11 +9,11 @@
 @end
 // CHECK: [[@LINE+1]]:17 | class(test)/ObjC | MyTestCase | c:objc(cs)MyTestCase | <no-cgname> | Def | rel: 0
 @implementation MyTestCase
-// CHECK: [[@LINE+1]]:1 | instance-method(test)/ObjC | testMe | c:objc(cs)MyTestCase(im)testMe | -[MyTestCase testMe] | Def,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:8 | instance-method(test)/ObjC | testMe | c:objc(cs)MyTestCase(im)testMe | -[MyTestCase testMe] | Def,Dyn,RelChild | rel: 1
 -(void)testMe {}
-// CHECK: [[@LINE+1]]:1 | instance-method/ObjC | testResult | c:objc(cs)MyTestCase(im)testResult | -[MyTestCase testResult] | Def,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:6 | instance-method/ObjC | testResult | c:objc(cs)MyTestCase(im)testResult | -[MyTestCase testResult] | Def,Dyn,RelChild | rel: 1
 -(id)testResult { return 0; }
-// CHECK: [[@LINE+1]]:1 | instance-method/ObjC | testWithInt: | c:objc(cs)MyTestCase(im)testWithInt: | -[MyTestCase testWithInt:] | Def,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:8 | instance-method/ObjC | testWithInt: | c:objc(cs)MyTestCase(im)testWithInt: | -[MyTestCase testWithInt:] | Def,Dyn,RelChild | rel: 1
 -(void)testWithInt:(int)i {}
 @end
 
@@ -22,7 +22,7 @@
 @end
 // CHECK: [[@LINE+1]]:17 | class(test)/ObjC | SubTestCase | c:objc(cs)SubTestCase | <no-cgname> | Def | rel: 0
 @implementation SubTestCase
-// CHECK: [[@LINE+1]]:1 | instance-method(test)/ObjC | testIt2 | c:objc(cs)SubTestCase(im)testIt2 | -[SubTestCase testIt2] | Def,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:8 | instance-method(test)/ObjC | testIt2 | c:objc(cs)SubTestCase(im)testIt2 | -[SubTestCase testIt2] | Def,Dyn,RelChild | rel: 1
 -(void)testIt2 {}
 @end
 
@@ -34,7 +34,7 @@
 // CHECK: [[@LINE+2]]:17 | class(test)/ObjC | MyTestCase | c:objc(cs)MyTestCase | _OBJC_CLASS_$_MyTestCase | Ref,RelCont | rel: 1
 // CHECK: [[@LINE+1]]:28 | extension/ObjC | MyTestCase | c:objc(cy)MyTestCase@cat | <no-cgname> | Def | rel: 0
 @implementation MyTestCase(cat)
-// CHECK: [[@LINE+1]]:1 | instance-method(test)/ObjC | testInCat | c:objc(cs)MyTestCase(im)testInCat | -[MyTestCase(cat) testInCat] | Def,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:9 | instance-method(test)/ObjC | testInCat | c:objc(cs)MyTestCase(im)testInCat | -[MyTestCase(cat) testInCat] | Def,Dyn,RelChild | rel: 1
 - (void)testInCat {}
 @end
 
@@ -47,7 +47,7 @@
 @property (readonly) IBOutlet id prop;
 // CHECK: [[@LINE+1]]:54 | instance-property(IB,IBColl)/ObjC | propColl | c:objc(cs)IBCls(py)propColl | <no-cgname> | Decl,RelChild | rel: 1
 @property (readonly) IBOutletCollection(NSButton) id propColl;
-// CHECK: [[@LINE+1]]:1 | instance-method(IB)/ObjC | doIt | c:objc(cs)IBCls(im)doIt | -[IBCls doIt] | Decl,Dyn,RelChild | rel: 1
+// CHECK: [[@LINE+1]]:12 | instance-method(IB)/ObjC | doIt | c:objc(cs)IBCls(im)doIt | -[IBCls doIt] | Decl,Dyn,RelChild | rel: 1
 -(IBAction)doIt;
 @end
 
diff --git a/test/Index/index-decls.m b/test/Index/index-decls.m
index a39d9e3bfa6b..7f7f11576ab8 100644
--- a/test/Index/index-decls.m
+++ b/test/Index/index-decls.m
@@ -58,7 +58,7 @@ int test1() {
 // RUN: c-index-test -index-file %s -target x86_64-apple-macosx10.7 > %t
 // RUN: FileCheck %s -input-file=%t
 // CHECK: [indexDeclaration]: kind: objc-class | name: I | {{.*}} | loc: 1:12
-// CHECK: [indexDeclaration]: kind: objc-instance-method | name: prop | {{.*}} | loc: 3:2
+// CHECK: [indexDeclaration]: kind: objc-instance-method | name: prop | {{.*}} | loc: 3:7
 // CHECK: [indexDeclaration]: kind: objc-property | name: prop | {{.*}} | loc: 2:25
 // CHECK: [indexDeclaration]: kind: objc-category | name:  | {{.*}} | loc: 6:12
 // CHECK: [indexDeclaration]: kind: objc-instance-method | name: setProp: | {{.*}} | loc: 7:33
@@ -82,5 +82,5 @@ int test1() {
 // CHECK-NOT: [indexDeclaration]: kind: objc-instance-method {{.*}} loc: 37:
 // CHECK-NOT: [indexDeclaration]: kind: objc-instance-method {{.*}} loc: 43:
 
-// CHECK: [indexDeclaration]: kind: objc-instance-method | name: meth | {{.*}} loc: 54:1 | {{.*}} | isRedecl: 0 | isDef: 0 |
+// CHECK: [indexDeclaration]: kind: objc-instance-method | name: meth | {{.*}} loc: 54:8 | {{.*}} | isRedecl: 0 | isDef: 0 |
 // CHECK: [indexDeclaration]: kind: objc-property | name: c | USR: c:objc(cs)I5(cpy)c | lang: ObjC | cursor: ObjCPropertyDecl=c:55:23 [class,] | loc: 55:23
diff --git a/test/Index/index-module.m b/test/Index/index-module.m
index d1c0b36b389e..51faea2d32bb 100644
--- a/test/Index/index-module.m
+++ b/test/Index/index-module.m
@@ -52,7 +52,7 @@ int glob;
 // CHECK-TMOD-NEXT: [indexDeclaration]: kind: objc-class | name: Module | {{.*}} | loc: [[TMOD_MODULE_H]]:15:12
 // CHECK-TMOD-NEXT:      <ObjCContainerInfo>: kind: interface
 // CHECK-TMOD-NEXT: [indexDeclaration]: kind: objc-class-method | name: version | {{.*}} | loc: [[TMOD_MODULE_H]]:16:1
-// CHECK-TMOD-NEXT: [indexDeclaration]: kind: objc-class-method | name: alloc | {{.*}} | loc: [[TMOD_MODULE_H]]:17:1
+// CHECK-TMOD-NEXT: [indexDeclaration]: kind: objc-class-method | name: alloc | {{.*}} | loc: [[TMOD_MODULE_H]]:17:2
 // CHECK-TMOD-NEXT: [indexDeclaration]: kind: typedef | name: FILE | {{.*}} | loc: [[TMOD_MODULE_H]]:30:3
 // CHECK-TMOD-NEXT: [indexDeclaration]: kind: struct | name: __sFILE | {{.*}} | loc: [[TMOD_MODULE_H]]:28:16
 // CHECK-TMOD-NEXT: [indexDeclaration]: kind: field | name: _offset | {{.*}} | loc: [[TMOD_MODULE_H]]:29:7
diff --git a/test/OpenMP/openmp_seh.c b/test/OpenMP/openmp_seh.c
new file mode 100644
index 000000000000..74dccec125e3
--- /dev/null
+++ b/test/OpenMP/openmp_seh.c
@@ -0,0 +1,18 @@
+// RUN: %clang_cc1 -verify -triple x86_64-pc-windows-msvc19.0.0 -fopenmp -fms-compatibility -x c++ -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+// REQUIRES: x86-registered-target
+extern "C" {
+void __cpuid(int[4], int);
+}
+
+// CHECK-LABEL: @main
+int main(void) {
+  __try {
+    int info[4];
+    __cpuid(info, 1);
+  } __except (1) {
+  }
+
+  return 0;
+}
+
diff --git a/test/SemaCXX/constant-expression.cpp b/test/SemaCXX/constant-expression.cpp
index f82a69209379..69e846bf0ec2 100644
--- a/test/SemaCXX/constant-expression.cpp
+++ b/test/SemaCXX/constant-expression.cpp
@@ -143,3 +143,14 @@ namespace rdar16064952 {
 }
 
 char PR17381_ice = 1000000 * 1000000; // expected-warning {{overflow}} expected-warning {{changes value}}
+
+namespace PR31701 {
+  struct C {
+    template<int i> static int n; // expected-warning {{extension}}
+  };
+  template <int M> class D;
+  template <int M>
+  template<int i> void D<M>::set() { // expected-error {{from class 'D<M>' without definition}}
+    const C c = C::n<i>;
+  }
+}
diff --git a/test/SemaCXX/new-delete-cxx0x.cpp b/test/SemaCXX/new-delete-cxx0x.cpp
index c55152510e1b..4ef586f2e565 100644
--- a/test/SemaCXX/new-delete-cxx0x.cpp
+++ b/test/SemaCXX/new-delete-cxx0x.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -triple=i686-pc-linux-gnu
+// RUN: %clang_cc1 -fsyntax-only -verify %s -std=c++11 -triple=i686-pc-linux-gnu -pedantic
 
 void ugly_news(int *ip) {
   (void)new int[-1]; // expected-error {{array size is negative}}
@@ -29,6 +29,7 @@ void fn(int n) {
   (void) new int[2] {1, 2};
   (void) new S[2] {1, 2};
   (void) new S[3] {1, 2};
+  (void) new S[n] {};
   // C++11 [expr.new]p19:
   //   If the new-expression creates an object or an array of objects of class
   //   type, access and ambiguity control are done for the allocation function,
@@ -44,6 +45,7 @@ void fn(int n) {
   (void) new T[2] {1, 2}; // ok
   (void) new T[3] {1, 2}; // expected-error {{no matching constructor}} expected-note {{in implicit initialization of array element 2}}
   (void) new T[n] {1, 2}; // expected-error {{no matching constructor}} expected-note {{in implicit initialization of trailing array elements in runtime-sized array new}}
+  (void) new T[n] {}; // expected-error {{no matching constructor}} expected-note {{in implicit initialization of trailing array elements in runtime-sized array new}}
 }
 
 struct U {
diff --git a/tools/libclang/CXIndexDataConsumer.cpp b/tools/libclang/CXIndexDataConsumer.cpp
index 1981cabbbe4c..cb8aebfd0c48 100644
--- a/tools/libclang/CXIndexDataConsumer.cpp
+++ b/tools/libclang/CXIndexDataConsumer.cpp
@@ -95,7 +95,7 @@ class IndexingDeclVisitor : public ConstDeclVisitor<IndexingDeclVisitor, bool> {
     if (isa<ObjCImplDecl>(LexicalDC) && !D->isThisDeclarationADefinition())
       DataConsumer.handleSynthesizedObjCMethod(D, DeclLoc, LexicalDC);
     else
-      DataConsumer.handleObjCMethod(D);
+      DataConsumer.handleObjCMethod(D, DeclLoc);
     return true;
   }
 
@@ -801,7 +801,8 @@ bool CXIndexDataConsumer::handleObjCCategoryImpl(const ObjCCategoryImplDecl *D)
   return handleObjCContainer(D, CategoryLoc, getCursor(D), CatDInfo);
 }
 
-bool CXIndexDataConsumer::handleObjCMethod(const ObjCMethodDecl *D) {
+bool CXIndexDataConsumer::handleObjCMethod(const ObjCMethodDecl *D,
+                                           SourceLocation Loc) {
   bool isDef = D->isThisDeclarationADefinition();
   bool isContainer = isDef;
   bool isSkipped = false;
@@ -814,7 +815,7 @@ bool CXIndexDataConsumer::handleObjCMethod(const ObjCMethodDecl *D) {
   DeclInfo DInfo(!D->isCanonicalDecl(), isDef, isContainer);
   if (isSkipped)
     DInfo.flags |= CXIdxDeclFlag_Skipped;
-  return handleDecl(D, D->getLocation(), getCursor(D), DInfo);
+  return handleDecl(D, Loc, getCursor(D), DInfo);
 }
 
 bool CXIndexDataConsumer::handleSynthesizedObjCProperty(
diff --git a/tools/libclang/CXIndexDataConsumer.h b/tools/libclang/CXIndexDataConsumer.h
index 718a2a18b1b3..16162cb83f77 100644
--- a/tools/libclang/CXIndexDataConsumer.h
+++ b/tools/libclang/CXIndexDataConsumer.h
@@ -418,7 +418,7 @@ class CXIndexDataConsumer : public index::IndexDataConsumer {
   bool handleObjCCategory(const ObjCCategoryDecl *D);
   bool handleObjCCategoryImpl(const ObjCCategoryImplDecl *D);
 
-  bool handleObjCMethod(const ObjCMethodDecl *D);
+  bool handleObjCMethod(const ObjCMethodDecl *D, SourceLocation Loc);
 
   bool handleSynthesizedObjCProperty(const ObjCPropertyImplDecl *D);
   bool handleSynthesizedObjCMethod(const ObjCMethodDecl *D, SourceLocation Loc,

From 3dfdcbdf6f1f1ecb2acd9c1fb600d5ebf11219e7 Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Sun, 29 Jan 2017 20:58:54 +0000
Subject: [PATCH 3/5] Vendor import of compiler-rt release_40 branch r293443:
 https://llvm.org/svn/llvm-project/compiler-rt/branches/release_40@293443

---
 lib/builtins/arm/comparesf2.S | 2 +-
 lib/xray/xray_arm.cc          | 7 ++++++-
 test/xray/lit.cfg             | 8 +++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/builtins/arm/comparesf2.S b/lib/builtins/arm/comparesf2.S
index e8095650e4fb..a5b7e36487d6 100644
--- a/lib/builtins/arm/comparesf2.S
+++ b/lib/builtins/arm/comparesf2.S
@@ -283,7 +283,7 @@ DEFINE_COMPILERRT_FUNCTION(__unordsf2)
 END_COMPILERRT_FUNCTION(__unordsf2)
 
 #if defined(COMPILER_RT_ARMHF_TARGET)
-DEFINE_COMPILERRT_FUNCTION(__aeabi_fcmpum):
+DEFINE_COMPILERRT_FUNCTION(__aeabi_fcmpum)
 	vmov s0, r0
 	vmov s1, r1
 	b SYMBOL_NAME(__unordsf2)
diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc
index d89322e833e5..f5e2cd2a93c2 100644
--- a/lib/xray/xray_arm.cc
+++ b/lib/xray/xray_arm.cc
@@ -19,6 +19,8 @@
 #include <atomic>
 #include <cassert>
 
+extern "C" void __clear_cache(void* start, void* end);
+
 namespace __xray {
 
 uint64_t cycleFrequency() XRAY_NEVER_INSTRUMENT {
@@ -116,8 +118,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
   //   B #20
 
   uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+  uint32_t *CurAddress = FirstAddress + 1;
   if (Enable) {
-    uint32_t *CurAddress = FirstAddress + 1;
     CurAddress =
         Write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId));
     CurAddress =
@@ -125,6 +127,7 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
     *CurAddress = uint32_t(PatchOpcodes::PO_BlxIp);
     CurAddress++;
     *CurAddress = uint32_t(PatchOpcodes::PO_PopR0Lr);
+    CurAddress++;
     std::atomic_store_explicit(
         reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
         uint32_t(PatchOpcodes::PO_PushR0Lr), std::memory_order_release);
@@ -133,6 +136,8 @@ inline static bool patchSled(const bool Enable, const uint32_t FuncId,
         reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
         uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
   }
+  __clear_cache(reinterpret_cast<char*>(FirstAddress),
+      reinterpret_cast<char*>(CurAddress));
   return true;
 }
 
diff --git a/test/xray/lit.cfg b/test/xray/lit.cfg
index 5d030e101452..9142ad13618c 100644
--- a/test/xray/lit.cfg
+++ b/test/xray/lit.cfg
@@ -30,8 +30,14 @@ config.substitutions.append(
 # Default test suffixes.
 config.suffixes = ['.c', '.cc', '.cpp']
 
-if config.host_os not in ['Linux'] or config.host_arch.find('64') == -1:
+if config.host_os not in ['Linux']:
   config.unsupported = True
+elif '64' not in config.host_arch:
+  if 'arm' in config.host_arch:
+    if '-mthumb' in config.target_cflags:
+      config.unsupported = True
+  else:
+    config.unsupported = True
 
 # Allow tests to use REQUIRES=stable-runtime.  For use when you cannot use XFAIL
 # e.g. because the test sometimes passes, sometimes fails.

From 3efcd4e54b8b374c90b362d763a726b9e025efae Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Sun, 29 Jan 2017 20:59:02 +0000
Subject: [PATCH 4/5] Vendor import of libc++ release_40 branch r293443:
 https://llvm.org/svn/llvm-project/libcxx/branches/release_40@293443

---
 include/__config                              |  9 ------
 include/__string                              | 12 ++++---
 include/__threading_support                   | 22 ++++++++-----
 include/deque                                 | 23 +++++++++++--
 include/forward_list                          | 12 ++++++-
 include/list                                  | 28 ++++++++++++++--
 include/queue                                 |  7 +++-
 include/stack                                 |  7 +++-
 include/vector                                | 23 +++++++++++--
 test/libcxx/test/config.py                    |  9 ++++++
 .../queue/queue.defn/emplace.pass.cpp         | 14 ++++++--
 .../stack/stack.defn/emplace.pass.cpp         |  9 ++++++
 .../deque.modifiers/emplace_back.pass.cpp     |  9 +++++-
 .../deque.modifiers/emplace_front.pass.cpp    | 21 +++++++++++-
 .../emplace_front.pass.cpp                    | 19 +++++++++++
 .../list/list.modifiers/emplace_back.pass.cpp | 20 ++++++++++++
 .../list.modifiers/emplace_front.pass.cpp     | 20 ++++++++++++
 .../vector.bool/emplace_back.pass.cpp         | 30 ++++++++++++++---
 .../vector.modifiers/emplace_back.pass.cpp    | 32 +++++++++++++++++++
 www/cxx1z_status.html                         |  4 +--
 20 files changed, 286 insertions(+), 44 deletions(-)

diff --git a/include/__config b/include/__config
index ddfab31f7fac..340c573a2a40 100644
--- a/include/__config
+++ b/include/__config
@@ -403,15 +403,6 @@ namespace std {
 #define _LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK __attribute__((__no_sanitize__("unsigned-integer-overflow")))
 #endif 
 
-// A constexpr version of __builtin_memcmp was added in clang 4.0
-#if __has_builtin(__builtin_memcmp)
-# ifdef __apple_build_version__
-// No shipping version of Apple's clang has constexpr __builtin_memcmp
-# elif __clang_major__ > 3
-#  define _LIBCPP_BUILTIN_MEMCMP_ISCONSTEXPR
-# endif
-#endif
-
 #elif defined(_LIBCPP_COMPILER_GCC)
 
 #define _ALIGNAS(x) __attribute__((__aligned__(x)))
diff --git a/include/__string b/include/__string
index c85b874f054b..75608cea71d9 100644
--- a/include/__string
+++ b/include/__string
@@ -243,7 +243,7 @@ char_traits<char>::compare(const char_type* __s1, const char_type* __s2, size_t
 {
     if (__n == 0)
         return 0;
-#ifdef _LIBCPP_BUILTIN_MEMCMP_ISCONSTEXPR
+#if __has_feature(cxx_constexpr_string_builtins)
     return __builtin_memcmp(__s1, __s2, __n);
 #elif _LIBCPP_STD_VER <= 14
     return memcmp(__s1, __s2, __n);
@@ -265,7 +265,9 @@ char_traits<char>::find(const char_type* __s, size_t __n, const char_type& __a)
 {
     if (__n == 0)
         return NULL;
-#if _LIBCPP_STD_VER <= 14
+#if __has_feature(cxx_constexpr_string_builtins)
+    return __builtin_char_memchr(__s, to_int_type(__a), __n);
+#elif _LIBCPP_STD_VER <= 14
     return (const char_type*) memchr(__s, to_int_type(__a), __n);
 #else
     for (; __n; --__n)
@@ -331,7 +333,7 @@ char_traits<wchar_t>::compare(const char_type* __s1, const char_type* __s2, size
 {
     if (__n == 0)
         return 0;
-#if __has_builtin(__builtin_wmemcmp)
+#if __has_feature(cxx_constexpr_string_builtins)
     return __builtin_wmemcmp(__s1, __s2, __n);
 #elif _LIBCPP_STD_VER <= 14
     return wmemcmp(__s1, __s2, __n);
@@ -351,7 +353,7 @@ inline _LIBCPP_CONSTEXPR_AFTER_CXX14
 size_t
 char_traits<wchar_t>::length(const char_type* __s) _NOEXCEPT
 {
-#if __has_builtin(__builtin_wcslen)
+#if __has_feature(cxx_constexpr_string_builtins)
     return __builtin_wcslen(__s);
 #elif _LIBCPP_STD_VER <= 14
     return wcslen(__s);
@@ -369,7 +371,7 @@ char_traits<wchar_t>::find(const char_type* __s, size_t __n, const char_type& __
 {
     if (__n == 0)
         return NULL;
-#if __has_builtin(__builtin_wmemchr)
+#if __has_feature(cxx_constexpr_string_builtins)
         return __builtin_wmemchr(__s, __a, __n);
 #elif _LIBCPP_STD_VER <= 14
     return wmemchr(__s, __a, __n);
diff --git a/include/__threading_support b/include/__threading_support
index a672a9f80eda..817a2cfb934f 100644
--- a/include/__threading_support
+++ b/include/__threading_support
@@ -40,6 +40,12 @@
 #define _LIBCPP_THREAD_ABI_VISIBILITY inline _LIBCPP_INLINE_VISIBILITY
 #endif
 
+#if defined(__FreeBSD__) && defined(__clang__) && __has_attribute(no_thread_safety_analysis)
+#define _LIBCPP_NO_THREAD_SAFETY_ANALYSIS __attribute__((no_thread_safety_analysis))
+#else
+#define _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
+#endif
+
 _LIBCPP_BEGIN_NAMESPACE_STD
 
 #if defined(_LIBCPP_HAS_THREAD_API_PTHREAD)
@@ -98,25 +104,25 @@ typedef DWORD __libcpp_tls_key;
 _LIBCPP_THREAD_ABI_VISIBILITY
 int __libcpp_recursive_mutex_init(__libcpp_recursive_mutex_t *__m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_recursive_mutex_lock(__libcpp_recursive_mutex_t *__m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_recursive_mutex_trylock(__libcpp_recursive_mutex_t *__m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_recursive_mutex_unlock(__libcpp_recursive_mutex_t *__m);
 
 _LIBCPP_THREAD_ABI_VISIBILITY
 int __libcpp_recursive_mutex_destroy(__libcpp_recursive_mutex_t *__m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_mutex_lock(__libcpp_mutex_t *__m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_mutex_trylock(__libcpp_mutex_t *__m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_mutex_unlock(__libcpp_mutex_t *__m);
 
 _LIBCPP_THREAD_ABI_VISIBILITY
@@ -129,10 +135,10 @@ int __libcpp_condvar_signal(__libcpp_condvar_t* __cv);
 _LIBCPP_THREAD_ABI_VISIBILITY
 int __libcpp_condvar_broadcast(__libcpp_condvar_t* __cv);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_condvar_wait(__libcpp_condvar_t* __cv, __libcpp_mutex_t* __m);
 
-_LIBCPP_THREAD_ABI_VISIBILITY
+_LIBCPP_THREAD_ABI_VISIBILITY _LIBCPP_NO_THREAD_SAFETY_ANALYSIS
 int __libcpp_condvar_timedwait(__libcpp_condvar_t *__cv, __libcpp_mutex_t *__m,
                                timespec *__ts);
 
diff --git a/include/deque b/include/deque
index 0454162a8f73..92801540f2e7 100644
--- a/include/deque
+++ b/include/deque
@@ -110,8 +110,8 @@ public:
     void push_front(value_type&& v);
     void push_back(const value_type& v);
     void push_back(value_type&& v);
-    template <class... Args> reference emplace_front(Args&&... args);
-    template <class... Args> reference emplace_back(Args&&... args);
+    template <class... Args> reference emplace_front(Args&&... args);  // reference in C++17
+    template <class... Args> reference emplace_back(Args&&... args);   // reference in C++17
     template <class... Args> iterator emplace(const_iterator p, Args&&... args);
     iterator insert(const_iterator p, const value_type& v);
     iterator insert(const_iterator p, value_type&& v);
@@ -1342,8 +1342,13 @@ public:
     void push_back(const value_type& __v);
 #ifndef _LIBCPP_HAS_NO_RVALUE_REFERENCES
 #ifndef _LIBCPP_HAS_NO_VARIADICS
+#if _LIBCPP_STD_VER > 14
     template <class... _Args> reference emplace_front(_Args&&... __args);
-    template <class... _Args> reference emplace_back(_Args&&... __args);
+    template <class... _Args> reference emplace_back (_Args&&... __args);
+#else
+    template <class... _Args> void      emplace_front(_Args&&... __args);
+    template <class... _Args> void      emplace_back (_Args&&... __args);
+#endif
     template <class... _Args> iterator emplace(const_iterator __p, _Args&&... __args);
 #endif  // _LIBCPP_HAS_NO_VARIADICS
     void push_front(value_type&& __v);
@@ -1822,7 +1827,11 @@ deque<_Tp, _Allocator>::push_back(value_type&& __v)
 
 template <class _Tp, class _Allocator>
 template <class... _Args>
+#if _LIBCPP_STD_VER > 14
 typename deque<_Tp, _Allocator>::reference
+#else
+void
+#endif
 deque<_Tp, _Allocator>::emplace_back(_Args&&... __args)
 {
     allocator_type& __a = __base::__alloc();
@@ -1832,7 +1841,9 @@ deque<_Tp, _Allocator>::emplace_back(_Args&&... __args)
     __alloc_traits::construct(__a, _VSTD::addressof(*__base::end()),
                               _VSTD::forward<_Args>(__args)...);
     ++__base::size();
+#if _LIBCPP_STD_VER > 14
     return *--__base::end();
+#endif
 }
 
 #endif  // _LIBCPP_HAS_NO_VARIADICS
@@ -1870,7 +1881,11 @@ deque<_Tp, _Allocator>::push_front(value_type&& __v)
 
 template <class _Tp, class _Allocator>
 template <class... _Args>
+#if _LIBCPP_STD_VER > 14
 typename deque<_Tp, _Allocator>::reference
+#else
+void
+#endif
 deque<_Tp, _Allocator>::emplace_front(_Args&&... __args)
 {
     allocator_type& __a = __base::__alloc();
@@ -1880,7 +1895,9 @@ deque<_Tp, _Allocator>::emplace_front(_Args&&... __args)
     __alloc_traits::construct(__a, _VSTD::addressof(*--__base::begin()), _VSTD::forward<_Args>(__args)...);
     --__base::__start_;
     ++__base::size();
+#if _LIBCPP_STD_VER > 14
     return *__base::begin();
+#endif
 }
 
 #endif  // _LIBCPP_HAS_NO_VARIADICS
diff --git a/include/forward_list b/include/forward_list
index ce9a4b1bca90..879f2d3ce087 100644
--- a/include/forward_list
+++ b/include/forward_list
@@ -87,7 +87,7 @@ public:
     reference       front();
     const_reference front() const;
 
-    template <class... Args> reference emplace_front(Args&&... args);
+    template <class... Args> reference emplace_front(Args&&... args);  // reference in C++17
     void push_front(const value_type& v);
     void push_front(value_type&& v);
 
@@ -747,7 +747,11 @@ public:
 
 #ifndef _LIBCPP_HAS_NO_RVALUE_REFERENCES
 #ifndef _LIBCPP_HAS_NO_VARIADICS
+#if _LIBCPP_STD_VER > 14
     template <class... _Args> reference emplace_front(_Args&&... __args);
+#else
+    template <class... _Args> void      emplace_front(_Args&&... __args);
+#endif
 #endif
     void push_front(value_type&& __v);
 #endif  // _LIBCPP_HAS_NO_RVALUE_REFERENCES
@@ -1103,7 +1107,11 @@ forward_list<_Tp, _Alloc>::assign(initializer_list<value_type> __il)
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+#if _LIBCPP_STD_VER > 14
 typename forward_list<_Tp, _Alloc>::reference
+#else
+void
+#endif
 forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args)
 {
     __node_allocator& __a = base::__alloc();
@@ -1113,7 +1121,9 @@ forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args)
                                   _VSTD::forward<_Args>(__args)...);
     __h->__next_ = base::__before_begin()->__next_;
     base::__before_begin()->__next_ = __h.release();
+#if _LIBCPP_STD_VER > 14
     return base::__before_begin()->__next_->__value_;
+#endif
 }
 
 #endif  // _LIBCPP_HAS_NO_VARIADICS
diff --git a/include/list b/include/list
index 76c50a156093..fa148db516bc 100644
--- a/include/list
+++ b/include/list
@@ -93,10 +93,10 @@ public:
     size_type max_size() const noexcept;
 
     template <class... Args>
-        reference emplace_front(Args&&... args);
+        reference emplace_front(Args&&... args); // reference in C++17
     void pop_front();
     template <class... Args>
-        reference emplace_back(Args&&... args);
+        reference emplace_back(Args&&... args);  // reference in C++17
     void pop_back();
     void push_front(const value_type& x);
     void push_front(value_type&& x);
@@ -969,9 +969,17 @@ public:
     void push_back(value_type&& __x);
 #ifndef _LIBCPP_HAS_NO_VARIADICS
     template <class... _Args>
+#if _LIBCPP_STD_VER > 14
        reference emplace_front(_Args&&... __args);
+#else
+       void      emplace_front(_Args&&... __args);
+#endif
     template <class... _Args>
+#if _LIBCPP_STD_VER > 14
         reference emplace_back(_Args&&... __args);
+#else
+       void       emplace_back(_Args&&... __args);
+#endif
     template <class... _Args>
         iterator emplace(const_iterator __p, _Args&&... __args);
 #endif  // _LIBCPP_HAS_NO_VARIADICS
@@ -1600,7 +1608,11 @@ list<_Tp, _Alloc>::push_back(value_type&& __x)
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+#if _LIBCPP_STD_VER > 14
 typename list<_Tp, _Alloc>::reference
+#else
+void
+#endif
 list<_Tp, _Alloc>::emplace_front(_Args&&... __args)
 {
     __node_allocator& __na = base::__node_alloc();
@@ -1609,12 +1621,20 @@ list<_Tp, _Alloc>::emplace_front(_Args&&... __args)
     __node_alloc_traits::construct(__na, _VSTD::addressof(__hold->__value_), _VSTD::forward<_Args>(__args)...);
     __link_nodes_at_front(__hold.get()->__as_link(), __hold.get()->__as_link());
     ++base::__sz();
+#if _LIBCPP_STD_VER > 14
     return __hold.release()->__value_;
+#else
+    __hold.release();
+#endif
 }
 
 template <class _Tp, class _Alloc>
 template <class... _Args>
+#if _LIBCPP_STD_VER > 14
 typename list<_Tp, _Alloc>::reference
+#else
+void
+#endif
 list<_Tp, _Alloc>::emplace_back(_Args&&... __args)
 {
     __node_allocator& __na = base::__node_alloc();
@@ -1624,7 +1644,11 @@ list<_Tp, _Alloc>::emplace_back(_Args&&... __args)
     __link_pointer __nl = __hold->__as_link();
     __link_nodes_at_back(__nl, __nl);
     ++base::__sz();
+#if _LIBCPP_STD_VER > 14
     return __hold.release()->__value_;
+#else
+    __hold.release();
+#endif
 }
 
 template <class _Tp, class _Alloc>
diff --git a/include/queue b/include/queue
index 271c203e1d24..57d420c7406c 100644
--- a/include/queue
+++ b/include/queue
@@ -63,7 +63,7 @@ public:
 
     void push(const value_type& v);
     void push(value_type&& v);
-    template <class... Args> reference emplace(Args&&... args);
+    template <class... Args> reference emplace(Args&&... args); // reference in C++17
     void pop();
 
     void swap(queue& q) noexcept(is_nothrow_swappable_v<Container>)
@@ -292,8 +292,13 @@ public:
 #ifndef _LIBCPP_HAS_NO_VARIADICS
     template <class... _Args>
         _LIBCPP_INLINE_VISIBILITY
+#if _LIBCPP_STD_VER > 14
         reference emplace(_Args&&... __args)
             { return c.emplace_back(_VSTD::forward<_Args>(__args)...);}
+#else
+        void     emplace(_Args&&... __args)
+            {        c.emplace_back(_VSTD::forward<_Args>(__args)...);}
+#endif
 #endif  // _LIBCPP_HAS_NO_VARIADICS
 #endif  // _LIBCPP_HAS_NO_RVALUE_REFERENCES
     _LIBCPP_INLINE_VISIBILITY
diff --git a/include/stack b/include/stack
index 228864e4709b..c797ea5ec6e9 100644
--- a/include/stack
+++ b/include/stack
@@ -55,7 +55,7 @@ public:
 
     void push(const value_type& x);
     void push(value_type&& x);
-    template <class... Args> reference emplace(Args&&... args);
+    template <class... Args> reference emplace(Args&&... args); // reference in C++17
     void pop();
 
     void swap(stack& c) noexcept(is_nothrow_swappable_v<Container>)
@@ -199,8 +199,13 @@ public:
 #ifndef _LIBCPP_HAS_NO_VARIADICS
     template <class... _Args>
         _LIBCPP_INLINE_VISIBILITY
+#if _LIBCPP_STD_VER > 14
         reference emplace(_Args&&... __args)
         { return c.emplace_back(_VSTD::forward<_Args>(__args)...);}
+#else
+        void      emplace(_Args&&... __args)
+        {        c.emplace_back(_VSTD::forward<_Args>(__args)...);}
+#endif
 #endif  // _LIBCPP_HAS_NO_VARIADICS
 #endif  // _LIBCPP_HAS_NO_RVALUE_REFERENCES
     _LIBCPP_INLINE_VISIBILITY
diff --git a/include/vector b/include/vector
index 4dd0e527ff57..ded057b10c8b 100644
--- a/include/vector
+++ b/include/vector
@@ -99,7 +99,7 @@ public:
     void push_back(const value_type& x);
     void push_back(value_type&& x);
     template <class... Args>
-        reference emplace_back(Args&&... args);
+        reference emplace_back(Args&&... args); // reference in C++17
     void pop_back();
 
     template <class... Args> iterator emplace(const_iterator position, Args&&... args);
@@ -218,7 +218,7 @@ public:
     const_reference back() const;
 
     void push_back(const value_type& x);
-    template <class... Args> reference emplace_back(Args&&... args);  // C++14
+    template <class... Args> reference emplace_back(Args&&... args);  // C++14; reference in C++17
     void pop_back();
 
     template <class... Args> iterator emplace(const_iterator position, Args&&... args);  // C++14
@@ -679,7 +679,11 @@ public:
 #ifndef _LIBCPP_HAS_NO_VARIADICS
     template <class... _Args>
         _LIBCPP_INLINE_VISIBILITY
+#if _LIBCPP_STD_VER > 14
         reference emplace_back(_Args&&... __args);
+#else
+        void      emplace_back(_Args&&... __args);
+#endif
 #endif  // _LIBCPP_HAS_NO_VARIADICS
 #endif  // _LIBCPP_HAS_NO_RVALUE_REFERENCES
     _LIBCPP_INLINE_VISIBILITY
@@ -1625,7 +1629,11 @@ vector<_Tp, _Allocator>::__emplace_back_slow_path(_Args&&... __args)
 template <class _Tp, class _Allocator>
 template <class... _Args>
 inline
+#if _LIBCPP_STD_VER > 14
 typename vector<_Tp, _Allocator>::reference
+#else
+void
+#endif
 vector<_Tp, _Allocator>::emplace_back(_Args&&... __args)
 {
     if (this->__end_ < this->__end_cap())
@@ -1639,7 +1647,9 @@ vector<_Tp, _Allocator>::emplace_back(_Args&&... __args)
     }
     else
         __emplace_back_slow_path(_VSTD::forward<_Args>(__args)...);
+#if _LIBCPP_STD_VER > 14
     return this->back();
+#endif
 }
 
 #endif  // _LIBCPP_HAS_NO_VARIADICS
@@ -2336,9 +2346,16 @@ public:
     void push_back(const value_type& __x);
 #if _LIBCPP_STD_VER > 11
     template <class... _Args>
-    _LIBCPP_INLINE_VISIBILITY reference emplace_back(_Args&&... __args) {
+#if _LIBCPP_STD_VER > 14
+    _LIBCPP_INLINE_VISIBILITY reference emplace_back(_Args&&... __args)
+#else
+    _LIBCPP_INLINE_VISIBILITY void      emplace_back(_Args&&... __args)
+#endif
+    {
         push_back ( value_type ( _VSTD::forward<_Args>(__args)... ));
+#if _LIBCPP_STD_VER > 14
         return this->back();
+#endif
     }
 #endif
 
diff --git a/test/libcxx/test/config.py b/test/libcxx/test/config.py
index 5fcb8f4a9b3f..3d790249ce66 100644
--- a/test/libcxx/test/config.py
+++ b/test/libcxx/test/config.py
@@ -403,6 +403,15 @@ def configure_default_compile_flags(self):
         if not std:
             # Choose the newest possible language dialect if none is given.
             possible_stds = ['c++1z', 'c++14', 'c++11', 'c++03']
+            if self.cxx.type == 'gcc':
+                maj_v, _, _ = self.cxx.version
+                maj_v = int(maj_v)
+                if maj_v < 7:
+                    possible_stds.remove('c++1z')
+                # FIXME: How many C++14 tests actually fail under GCC 5 and 6?
+                # Should we XFAIL them individually instead?
+                if maj_v <= 6:
+                    possible_stds.remove('c++14')
             for s in possible_stds:
                 if self.cxx.hasCompileFlag('-std=%s' % s):
                     std = s
diff --git a/test/std/containers/container.adaptors/queue/queue.defn/emplace.pass.cpp b/test/std/containers/container.adaptors/queue/queue.defn/emplace.pass.cpp
index 77d822a0794d..4142bc3694d1 100644
--- a/test/std/containers/container.adaptors/queue/queue.defn/emplace.pass.cpp
+++ b/test/std/containers/container.adaptors/queue/queue.defn/emplace.pass.cpp
@@ -12,25 +12,35 @@
 // <queue>
 
 // template <class... Args> reference emplace(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
+
 
 #include <queue>
 #include <cassert>
 
+#include "test_macros.h"
+
 #include "../../../Emplaceable.h"
 
 int main()
 {
     typedef Emplaceable T;
     std::queue<Emplaceable> q;
+#if TEST_STD_VER > 14
     T& r1 = q.emplace(1, 2.5);
     assert(&r1 == &q.back());
     T& r2 = q.emplace(2, 3.5);
     assert(&r2 == &q.back());
     T& r3 = q.emplace(3, 4.5);
     assert(&r3 == &q.back());
+    assert(&r1 == &q.front());
+#else
+    q.emplace(1, 2.5);
+    q.emplace(2, 3.5);
+    q.emplace(3, 4.5);
+#endif
+
     assert(q.size() == 3);
     assert(q.front() == Emplaceable(1, 2.5));
     assert(q.back() == Emplaceable(3, 4.5));
-    assert(&r3 == &q.back());
-    assert(&r1 == &q.front());
 }
diff --git a/test/std/containers/container.adaptors/stack/stack.defn/emplace.pass.cpp b/test/std/containers/container.adaptors/stack/stack.defn/emplace.pass.cpp
index 71fe903b74a8..8b19972eb32e 100644
--- a/test/std/containers/container.adaptors/stack/stack.defn/emplace.pass.cpp
+++ b/test/std/containers/container.adaptors/stack/stack.defn/emplace.pass.cpp
@@ -12,22 +12,31 @@
 // <stack>
 
 // template <class... Args> reference emplace(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <stack>
 #include <cassert>
 
+#include "test_macros.h"
+
 #include "../../../Emplaceable.h"
 
 int main()
 {
     typedef Emplaceable T;
     std::stack<Emplaceable> q;
+#if TEST_STD_VER > 14
     T& r1 = q.emplace(1, 2.5);
     assert(&r1 == &q.top());
     T& r2 = q.emplace(2, 3.5);
     assert(&r2 == &q.top());
     T& r3 = q.emplace(3, 4.5);
     assert(&r3 == &q.top());
+#else
+    q.emplace(1, 2.5);
+    q.emplace(2, 3.5);
+    q.emplace(3, 4.5);
+#endif
     assert(q.size() == 3);
     assert(q.top() == Emplaceable(3, 4.5));
 }
diff --git a/test/std/containers/sequences/deque/deque.modifiers/emplace_back.pass.cpp b/test/std/containers/sequences/deque/deque.modifiers/emplace_back.pass.cpp
index e3a35362b186..13773fb1f5f8 100644
--- a/test/std/containers/sequences/deque/deque.modifiers/emplace_back.pass.cpp
+++ b/test/std/containers/sequences/deque/deque.modifiers/emplace_back.pass.cpp
@@ -12,6 +12,7 @@
 // <deque>
 
 // template <class... Args> reference emplace_back(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <deque>
 #include <cstddef>
@@ -49,15 +50,21 @@ void
 test(C& c1)
 {
     typedef typename C::iterator I;
-    typedef typename C::reference Ref;
     std::size_t c1_osize = c1.size();
+#if TEST_STD_VER > 14
+    typedef typename C::reference Ref;
     Ref ref = c1.emplace_back(Emplaceable(1, 2.5));
+#else
+              c1.emplace_back(Emplaceable(1, 2.5));
+#endif
     assert(c1.size() == c1_osize + 1);
     assert(distance(c1.begin(), c1.end())
                == static_cast<std::ptrdiff_t>(c1.size()));
     I i = c1.end();
     assert(*--i == Emplaceable(1, 2.5));
+#if TEST_STD_VER > 14
     assert(&(*i) == &ref);
+#endif
 }
 
 template <class C>
diff --git a/test/std/containers/sequences/deque/deque.modifiers/emplace_front.pass.cpp b/test/std/containers/sequences/deque/deque.modifiers/emplace_front.pass.cpp
index 26c700de7253..4c272be48da1 100644
--- a/test/std/containers/sequences/deque/deque.modifiers/emplace_front.pass.cpp
+++ b/test/std/containers/sequences/deque/deque.modifiers/emplace_front.pass.cpp
@@ -12,6 +12,7 @@
 // <deque>
 
 // template <class... Args> reference emplace_front(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <deque>
 #include <cstddef>
@@ -20,6 +21,7 @@
 #include "test_macros.h"
 #include "../../../Emplaceable.h"
 #include "min_allocator.h"
+#include "test_allocator.h"
 
 template <class C>
 C
@@ -48,15 +50,21 @@ void
 test(C& c1)
 {
     typedef typename C::iterator I;
-    typedef typename C::reference Ref;
     std::size_t c1_osize = c1.size();
+#if TEST_STD_VER > 14
+    typedef typename C::reference Ref;
     Ref res_ref = c1.emplace_front(Emplaceable(1, 2.5));
+#else
+                  c1.emplace_front(Emplaceable(1, 2.5));
+#endif
     assert(c1.size() == c1_osize + 1);
     assert(distance(c1.begin(), c1.end())
                == static_cast<std::ptrdiff_t>(c1.size()));
     I i = c1.begin();
     assert(*i == Emplaceable(1, 2.5));
+#if TEST_STD_VER > 14
     assert(&res_ref == &(*i));
+#endif
 }
 
 template <class C>
@@ -84,4 +92,15 @@ int main()
         for (int j = 0; j < N; ++j)
             testN<std::deque<Emplaceable, min_allocator<Emplaceable>> >(rng[i], rng[j]);
     }
+    {
+        std::deque<Tag_X, TaggingAllocator<Tag_X>> c;
+        c.emplace_front();
+        assert(c.size() == 1);
+        c.emplace_front(1, 2, 3);
+        assert(c.size() == 2);
+        c.emplace_front();
+        assert(c.size() == 3);
+        c.emplace_front(1, 2, 3);
+        assert(c.size() == 4);
+    }
 }
diff --git a/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp b/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
index 589e71894b8c..34591a1e1962 100644
--- a/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
+++ b/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp
@@ -12,10 +12,13 @@
 // <forward_list>
 
 // template <class... Args> reference emplace_front(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <forward_list>
 #include <cassert>
 
+#include "test_macros.h"
+
 #include "../../../Emplaceable.h"
 #include "min_allocator.h"
 
@@ -25,6 +28,7 @@ int main()
         typedef Emplaceable T;
         typedef std::forward_list<T> C;
         C c;
+#if TEST_STD_VER > 14
         T& r1 = c.emplace_front();
         assert(c.front() == Emplaceable());
         assert(&r1 == &c.front());
@@ -32,6 +36,13 @@ int main()
         T& r2 = c.emplace_front(1, 2.5);
         assert(c.front() == Emplaceable(1, 2.5));
         assert(&r2 == &c.front());
+#else
+        c.emplace_front();
+        assert(c.front() == Emplaceable());
+        assert(distance(c.begin(), c.end()) == 1);
+        c.emplace_front(1, 2.5);
+        assert(c.front() == Emplaceable(1, 2.5));
+#endif
         assert(*next(c.begin()) == Emplaceable());
         assert(distance(c.begin(), c.end()) == 2);
     }
@@ -39,6 +50,7 @@ int main()
         typedef Emplaceable T;
         typedef std::forward_list<T, min_allocator<T>> C;
         C c;
+#if TEST_STD_VER > 14
         T& r1 = c.emplace_front();
         assert(c.front() == Emplaceable());
         assert(&r1 == &c.front());
@@ -46,6 +58,13 @@ int main()
         T& r2 = c.emplace_front(1, 2.5);
         assert(c.front() == Emplaceable(1, 2.5));
         assert(&r2 == &c.front());
+#else
+        c.emplace_front();
+        assert(c.front() == Emplaceable());
+        assert(distance(c.begin(), c.end()) == 1);
+        c.emplace_front(1, 2.5);
+        assert(c.front() == Emplaceable(1, 2.5));
+#endif
         assert(*next(c.begin()) == Emplaceable());
         assert(distance(c.begin(), c.end()) == 2);
     }
diff --git a/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp b/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp
index 2aae2b9b09eb..c63b0d8fbdb0 100644
--- a/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp
+++ b/test/std/containers/sequences/list/list.modifiers/emplace_back.pass.cpp
@@ -12,10 +12,12 @@
 // <list>
 
 // template <class... Args> reference emplace_back(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <list>
 #include <cassert>
 
+#include "test_macros.h"
 #include "min_allocator.h"
 
 class A
@@ -37,6 +39,7 @@ int main()
 {
     {
     std::list<A> c;
+#if TEST_STD_VER > 14
     A& r1 = c.emplace_back(2, 3.5);
     assert(c.size() == 1);
     assert(&r1 == &c.back());
@@ -45,6 +48,14 @@ int main()
     A& r2 = c.emplace_back(3, 4.5);
     assert(c.size() == 2);
     assert(&r2 == &c.back());
+#else
+    c.emplace_back(2, 3.5);
+    assert(c.size() == 1);
+    assert(c.front().geti() == 2);
+    assert(c.front().getd() == 3.5);
+    c.emplace_back(3, 4.5);
+    assert(c.size() == 2);
+#endif
     assert(c.front().geti() == 2);
     assert(c.front().getd() == 3.5);
     assert(c.back().geti() == 3);
@@ -52,6 +63,7 @@ int main()
     }
     {
     std::list<A, min_allocator<A>> c;
+#if TEST_STD_VER > 14
     A& r1 = c.emplace_back(2, 3.5);
     assert(c.size() == 1);
     assert(&r1 == &c.back());
@@ -60,6 +72,14 @@ int main()
     A& r2 = c.emplace_back(3, 4.5);
     assert(c.size() == 2);
     assert(&r2 == &c.back());
+#else
+    c.emplace_back(2, 3.5);
+    assert(c.size() == 1);
+    assert(c.front().geti() == 2);
+    assert(c.front().getd() == 3.5);
+    c.emplace_back(3, 4.5);
+    assert(c.size() == 2);
+#endif
     assert(c.front().geti() == 2);
     assert(c.front().getd() == 3.5);
     assert(c.back().geti() == 3);
diff --git a/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp b/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp
index 994dac258f8c..a9be19e6f940 100644
--- a/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp
+++ b/test/std/containers/sequences/list/list.modifiers/emplace_front.pass.cpp
@@ -12,6 +12,7 @@
 // <list>
 
 // template <class... Args> reference emplace_front(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <list>
 #include <cassert>
@@ -37,6 +38,7 @@ int main()
 {
     {
     std::list<A> c;
+#if TEST_STD_VER > 14
     A& r1 = c.emplace_front(2, 3.5);
     assert(c.size() == 1);
     assert(&r1 == &c.front());
@@ -45,13 +47,23 @@ int main()
     A& r2 = c.emplace_front(3, 4.5);
     assert(c.size() == 2);
     assert(&r2 == &c.front());
+#else
+    c.emplace_front(2, 3.5);
+    assert(c.size() == 1);
+    assert(c.front().geti() == 2);
+    assert(c.front().getd() == 3.5);
+    c.emplace_front(3, 4.5);
+    assert(c.size() == 2);
+#endif
     assert(c.front().geti() == 3);
     assert(c.front().getd() == 4.5);
     assert(c.back().geti() == 2);
     assert(c.back().getd() == 3.5);
     }
+
     {
     std::list<A, min_allocator<A>> c;
+#if TEST_STD_VER > 14
     A& r1 = c.emplace_front(2, 3.5);
     assert(c.size() == 1);
     assert(&r1 == &c.front());
@@ -60,6 +72,14 @@ int main()
     A& r2 = c.emplace_front(3, 4.5);
     assert(c.size() == 2);
     assert(&r2 == &c.front());
+#else
+    c.emplace_front(2, 3.5);
+    assert(c.size() == 1);
+    assert(c.front().geti() == 2);
+    assert(c.front().getd() == 3.5);
+    c.emplace_front(3, 4.5);
+    assert(c.size() == 2);
+#endif
     assert(c.front().geti() == 3);
     assert(c.front().getd() == 4.5);
     assert(c.back().geti() == 2);
diff --git a/test/std/containers/sequences/vector.bool/emplace_back.pass.cpp b/test/std/containers/sequences/vector.bool/emplace_back.pass.cpp
index 24005bec82df..81a8ae475168 100644
--- a/test/std/containers/sequences/vector.bool/emplace_back.pass.cpp
+++ b/test/std/containers/sequences/vector.bool/emplace_back.pass.cpp
@@ -12,17 +12,20 @@
 //  vector.bool
 
 // template <class... Args> reference emplace_back(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <vector>
 #include <cassert>
+#include "test_macros.h"
 #include "min_allocator.h"
 
 int main()
 {
     {
         typedef std::vector<bool> C;
-        typedef C::reference Ref;
         C c;
+#if TEST_STD_VER > 14
+        typedef C::reference Ref;
         Ref r1 = c.emplace_back();
         assert(c.size() == 1);
         assert(c.front() == false);
@@ -36,19 +39,27 @@ int main()
         r2 = false;
         assert(c.back() == false);
         r2 = true;
-        Ref r3 = c.emplace_back(1 == 1);
+#else
+        c.emplace_back();
+        assert(c.size() == 1);
+        assert(c.front() == false);
+        c.emplace_back(true);
+        assert(c.size() == 2);
+        assert(c.front() == false);
+        assert(c.back() == true);
+#endif
+        c.emplace_back(1 == 1);
         assert(c.size() == 3);
         assert(c.front() == false);
         assert(c[1] == true);
         assert(c.back() == true);
-        r3 = false;
-        assert(c.back() == false);
     }
     {
         typedef std::vector<bool, min_allocator<bool>> C;
-        typedef C::reference Ref;
         C c;
 
+#if TEST_STD_VER > 14
+        typedef C::reference Ref;
         Ref r1 = c.emplace_back();
         assert(c.size() == 1);
         assert(c.front() == false);
@@ -62,6 +73,15 @@ int main()
         r2 = false;
         assert(c.back() == false);
         r2 = true;
+#else
+        c.emplace_back();
+        assert(c.size() == 1);
+        assert(c.front() == false);
+        c.emplace_back(true);
+        assert(c.size() == 2);
+        assert(c.front() == false);
+        assert(c.back() == true);
+#endif
         c.emplace_back(1 == 1);
         assert(c.size() == 3);
         assert(c.front() == false);
diff --git a/test/std/containers/sequences/vector/vector.modifiers/emplace_back.pass.cpp b/test/std/containers/sequences/vector/vector.modifiers/emplace_back.pass.cpp
index 2fece8c78c78..f70c9b9abec3 100644
--- a/test/std/containers/sequences/vector/vector.modifiers/emplace_back.pass.cpp
+++ b/test/std/containers/sequences/vector/vector.modifiers/emplace_back.pass.cpp
@@ -12,9 +12,11 @@
 // <vector>
 
 // template <class... Args> reference emplace_back(Args&&... args);
+// return type is 'reference' in C++17; 'void' before
 
 #include <vector>
 #include <cassert>
+#include "test_macros.h"
 #include "test_allocator.h"
 #include "min_allocator.h"
 #include "test_allocator.h"
@@ -56,6 +58,7 @@ int main()
 {
     {
         std::vector<A> c;
+#if TEST_STD_VER > 14
         A& r1 = c.emplace_back(2, 3.5);
         assert(c.size() == 1);
         assert(&r1 == &c.back());
@@ -65,6 +68,15 @@ int main()
         A& r2 = c.emplace_back(3, 4.5);
         assert(c.size() == 2);
         assert(&r2 == &c.back());
+#else
+        c.emplace_back(2, 3.5);
+        assert(c.size() == 1);
+        assert(c.front().geti() == 2);
+        assert(c.front().getd() == 3.5);
+        assert(is_contiguous_container_asan_correct(c));
+        c.emplace_back(3, 4.5);
+        assert(c.size() == 2);
+#endif
         assert(c.front().geti() == 2);
         assert(c.front().getd() == 3.5);
         assert(c.back().geti() == 3);
@@ -73,6 +85,7 @@ int main()
     }
     {
         std::vector<A, limited_allocator<A, 4> > c;
+#if TEST_STD_VER > 14
         A& r1 = c.emplace_back(2, 3.5);
         assert(c.size() == 1);
         assert(&r1 == &c.back());
@@ -82,6 +95,15 @@ int main()
         A& r2 = c.emplace_back(3, 4.5);
         assert(c.size() == 2);
         assert(&r2 == &c.back());
+#else
+        c.emplace_back(2, 3.5);
+        assert(c.size() == 1);
+        assert(c.front().geti() == 2);
+        assert(c.front().getd() == 3.5);
+        assert(is_contiguous_container_asan_correct(c));
+        c.emplace_back(3, 4.5);
+        assert(c.size() == 2);
+#endif
         assert(c.front().geti() == 2);
         assert(c.front().getd() == 3.5);
         assert(c.back().geti() == 3);
@@ -90,6 +112,7 @@ int main()
     }
     {
         std::vector<A, min_allocator<A>> c;
+#if TEST_STD_VER > 14
         A& r1 = c.emplace_back(2, 3.5);
         assert(c.size() == 1);
         assert(&r1 == &c.back());
@@ -99,6 +122,15 @@ int main()
         A& r2 = c.emplace_back(3, 4.5);
         assert(c.size() == 2);
         assert(&r2 == &c.back());
+#else
+        c.emplace_back(2, 3.5);
+        assert(c.size() == 1);
+        assert(c.front().geti() == 2);
+        assert(c.front().getd() == 3.5);
+        assert(is_contiguous_container_asan_correct(c));
+        c.emplace_back(3, 4.5);
+        assert(c.size() == 2);
+#endif
         assert(c.front().geti() == 2);
         assert(c.front().getd() == 3.5);
         assert(c.back().geti() == 3);
diff --git a/www/cxx1z_status.html b/www/cxx1z_status.html
index 22503b98016e..bf470668b7f4 100644
--- a/www/cxx1z_status.html
+++ b/www/cxx1z_status.html
@@ -376,10 +376,10 @@
 	<tr><td><a href="http://wg21.link/LWG2680">2680</a></td><td>Add "Equivalent to" to filesystem</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2681">2681</a></td><td>filesystem::copy() cannot copy symlinks</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2682">2682</a></td><td>filesystem::copy() won't create a symlink to a directory</td><td>Issaquah</td><td>Complete</td></tr>
-	<tr><td><a href="http://wg21.link/LWG2686">2686</a></td><td>Why is std::hash specialized for error_code, but not error_condition?</td><td>Issaquah</td><td>Patch ready</td></tr>
+	<tr><td><a href="http://wg21.link/LWG2686">2686</a></td><td>Why is std::hash specialized for error_code, but not error_condition?</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2694">2694</a></td><td>Application of LWG 436 accidentally deleted definition of "facet"</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2696">2696</a></td><td>Interaction between make_shared and enable_shared_from_this is underspecified</td><td>Issaquah</td><td></td></tr>
-	<tr><td><a href="http://wg21.link/LWG2699">2699</a></td><td>Missing restriction in [numeric.requirements]</td><td>Issaquah</td><td></td></tr>
+	<tr><td><a href="http://wg21.link/LWG2699">2699</a></td><td>Missing restriction in [numeric.requirements]</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2712">2712</a></td><td>copy_file(from, to, ...) has a number of unspecified error conditions</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2722">2722</a></td><td>equivalent incorrectly specifies throws clause</td><td>Issaquah</td><td>Complete</td></tr>
 	<tr><td><a href="http://wg21.link/LWG2729">2729</a></td><td>Missing SFINAE on std::pair::operator=</td><td>Issaquah</td><td></td></tr>

From b025d011dd270230aeb50d7174a2a05616fe81eb Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Sun, 29 Jan 2017 20:59:10 +0000
Subject: [PATCH 5/5] Vendor import of lld release_40 branch r293443:
 https://llvm.org/svn/llvm-project/lld/branches/release_40@293443

---
 CMakeLists.txt                        |  1 +
 ELF/LinkerScript.cpp                  |  7 +--
 ELF/SyntheticSections.h               |  4 +-
 cmake/modules/AddLLD.cmake            | 34 +++++++++-
 docs/ReleaseNotes.rst                 | 89 ++++++++++++++++++++++++---
 test/ELF/linkerscript/common-assign.s | 48 +++++++++++++++
 6 files changed, 164 insertions(+), 19 deletions(-)
 create mode 100644 test/ELF/linkerscript/common-assign.s

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 48ac5e038cdb..be424efbbd87 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -42,6 +42,7 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   include_directories("${LLVM_BINARY_DIR}/include" ${LLVM_INCLUDE_DIRS})
   link_directories(${LLVM_LIBRARY_DIRS})
 
+  set(LLVM_LIBRARY_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/lib${LLVM_LIBDIR_SUFFIX})
   set(LLVM_RUNTIME_OUTPUT_INTDIR ${CMAKE_BINARY_DIR}/${CMAKE_CFG_INTDIR}/bin)
   find_program(LLVM_TABLEGEN_EXE "llvm-tblgen" ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
 
diff --git a/ELF/LinkerScript.cpp b/ELF/LinkerScript.cpp
index 887ca12537ad..3cc235386b88 100644
--- a/ELF/LinkerScript.cpp
+++ b/ELF/LinkerScript.cpp
@@ -918,12 +918,7 @@ const OutputSectionBase *LinkerScript<ELFT>::getSymbolSection(StringRef S) {
     return CurOutSec ? CurOutSec : (*OutputSections)[0];
   }
 
-  if (auto *DR = dyn_cast_or_null<DefinedRegular<ELFT>>(Sym))
-    return DR->Section ? DR->Section->OutSec : nullptr;
-  if (auto *DS = dyn_cast_or_null<DefinedSynthetic>(Sym))
-    return DS->Section;
-
-  return nullptr;
+  return SymbolTableSection<ELFT>::getOutputSection(Sym);
 }
 
 // Returns indices of ELF headers containing specific section, identified
diff --git a/ELF/SyntheticSections.h b/ELF/SyntheticSections.h
index dfefb3821e75..f7e891ec3a66 100644
--- a/ELF/SyntheticSections.h
+++ b/ELF/SyntheticSections.h
@@ -372,6 +372,8 @@ class SymbolTableSection final : public SyntheticSection<ELFT> {
 
   ArrayRef<SymbolTableEntry> getSymbols() const { return Symbols; }
 
+  static const OutputSectionBase *getOutputSection(SymbolBody *Sym);
+
   unsigned NumLocals = 0;
   StringTableSection<ELFT> &StrTabSec;
 
@@ -379,8 +381,6 @@ class SymbolTableSection final : public SyntheticSection<ELFT> {
   void writeLocalSymbols(uint8_t *&Buf);
   void writeGlobalSymbols(uint8_t *Buf);
 
-  const OutputSectionBase *getOutputSection(SymbolBody *Sym);
-
   // A vector of symbols and their string table offsets.
   std::vector<SymbolTableEntry> Symbols;
 };
diff --git a/cmake/modules/AddLLD.cmake b/cmake/modules/AddLLD.cmake
index 906b2952a943..fd1d44199ca6 100644
--- a/cmake/modules/AddLLD.cmake
+++ b/cmake/modules/AddLLD.cmake
@@ -1,6 +1,38 @@
 macro(add_lld_library name)
-  llvm_add_library(${name} ${ARGN})
+  cmake_parse_arguments(ARG
+    "SHARED"
+    ""
+    ""
+    ${ARGN})
+  if(ARG_SHARED)
+    set(ARG_ENABLE_SHARED SHARED)
+  endif()
+  llvm_add_library(${name} ${ARG_ENABLE_SHARED} ${ARG_UNPARSED_ARGUMENTS})
   set_target_properties(${name} PROPERTIES FOLDER "lld libraries")
+
+  if (LLD_BUILD_TOOLS)
+    if(${name} IN_LIST LLVM_DISTRIBUTION_COMPONENTS OR
+        NOT LLVM_DISTRIBUTION_COMPONENTS)
+      set(export_to_lldtargets EXPORT lldTargets)
+      set_property(GLOBAL PROPERTY LLD_HAS_EXPORTS True)
+    endif()
+
+    install(TARGETS ${name}
+      COMPONENT ${name}
+      ${export_to_lldtargets}
+      LIBRARY DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      ARCHIVE DESTINATION lib${LLVM_LIBDIR_SUFFIX}
+      RUNTIME DESTINATION bin)
+
+    if (${ARG_SHARED} AND NOT CMAKE_CONFIGURATION_TYPES)
+      add_custom_target(install-${name}
+        DEPENDS ${name}
+        COMMAND "${CMAKE_COMMAND}"
+          -DCMAKE_INSTALL_COMPONENT=${name}
+          -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
+    endif()
+    set_property(GLOBAL APPEND PROPERTY LLD_EXPORTS ${name})
+  endif()
 endmacro(add_lld_library)
 
 macro(add_lld_executable name)
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 8bd73db363d7..6eb857070690 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -11,25 +11,94 @@ LLD 4.0.0 Release Notes
 Introduction
 ============
 
-This document contains the release notes for the LLD linker, release 4.0.0.
+LLD is a linker which supports ELF (Unix), COFF (Windows) and Mach-O
+(macOS). It is generally faster than the GNU BFD/gold linkers or the
+MSVC linker.
+
+LLD is designed to be a drop-in replacmenet for the system linkers, so
+that users don't need to change their build systems other than swapping
+the linker command.
+
+This document contains the release notes for LLD 4.0.0.
 Here we describe the status of LLD, including major improvements
 from the previous release. All LLD releases may be downloaded
 from the `LLVM releases web site <http://llvm.org/releases/>`_.
 
-Non-comprehensive list of changes in this release
-=================================================
+
+What's New in LLD 4.0?
+======================
 
 ELF Improvements
 ----------------
 
-* Initial support for LTO.
+LLD provides much better compatibility with the GNU linker than before.
+Now it is able to link the entire FreeBSD base system including the kernel
+out of the box. We are working closely with the FreeBSD project to
+make it usable as the system linker in a future release of the operating
+system.
+
+Multi-threading performance has been improved, and multi-threading
+is now enabled by default. Combined with other optimizations, LLD 4.0
+is about 1.5 times faster than LLD 3.9 when linking large programs
+in our test environment.
+
+Other notable changes are listed below:
+
+* Error messages contain more information than before. If debug info
+  is available, the linker prints out not only the object file name
+  but the source location of unresolved symbols.
+
+* Error messages are printed in red just like Clang by default. You
+  can disable it by passing -no-color-diagnostics.
+
+* LLD's version string is now embedded in a .comment section in the
+  result output file. You can dump it with this command: ``objdump -j -s
+  .comment <file>``.
+
+* The -Map option is supported. With that, you can print out section
+  and symbol information to a specified file. This feature is useful
+  for analyzing link results.
+
+* The file format for the -reproduce option has changed from cpio to
+  tar.
+
+* When creating a copy relocation for a symbol, LLD now scans the
+  DSO's header to see if the symbol is in a read-only segment. If so,
+  space for the copy relocation is reserved in .bss.rel.ro instead of
+  .bss. This fixes a security issue that read-only data in a DSO
+  becomes writable if it is copied by a copy relocation. This issue
+  was disclosed originally on the binutils mailing list at
+  `<https://sourceware.org/ml/libc-alpha/2016-12/msg00914.html>`.
+
+* Compressed input sections are supported.
+
+* ``--oformat binary``, ``--section-start``, ``-Tbss``, ``-Tdata``,
+  ``-Ttext``, ``-b binary``, ``-build-id=uuid``, ``-no-rosegment``,
+  ``-nopie``, ``-nostdlib``, ``-omagic``, ``-retain-symbols-file``,
+  ``-sort-section``, ``-z max-page-size`` and ``-z wxneeded`` are
+  suppoorted.
+
+* A lot of linker script directives have been added.
+
+* Default image base address for x86-64 has changed from 0x10000 to
+  0x200000 to make it huge-page friendly.
+
+* ARM port now supports GNU ifunc, the ARM C++ exceptions ABI, TLS
+  relocations and static linking. Problems with dlopen() on systems
+  using eglibc fixed.
+
+* MIPS port now supports input files in new R6 revision of MIPS ABIs
+  or N32 ABI. Generated file now contains .MIPS.abiflags section and
+  complete set of ELF headers flags.
+
+* Relocations produced by the ``-mxgot`` compiler's flag is supported
+  for MIPS. Now it is possible to generate "large" GOT exceeds 64K
+  limit.
 
 COFF Improvements
 -----------------
 
-* Item 1.
-
-MachO Improvements
-------------------
-
-* Item 1.
+* Performance on Windows has been improved by parallelizing parts of the
+  linker and optimizing file system operations. As a result of these
+  improvements, LLD 4.0 has been measured to be about 2.5 times faster
+  than LLD 3.9 when linking a large Chromium DLL.
diff --git a/test/ELF/linkerscript/common-assign.s b/test/ELF/linkerscript/common-assign.s
new file mode 100644
index 000000000000..4244ae321110
--- /dev/null
+++ b/test/ELF/linkerscript/common-assign.s
@@ -0,0 +1,48 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t
+# RUN: echo "SECTIONS { . = SIZEOF_HEADERS; pfoo = foo; pbar = bar; }" > %t.script
+# RUN: ld.lld -o %t1 --script %t.script %t
+# RUN: llvm-readobj -symbols %t1 | FileCheck %s
+
+# CHECK:       Symbol {
+# CHECK:         Name: bar
+# CHECK-NEXT:     Value: 0x134
+# CHECK-NEXT:     Size: 4
+# CHECK-NEXT:     Binding: Global
+# CHECK-NEXT:     Type: Object
+# CHECK-NEXT:     Other: 0
+# CHECK-NEXT:     Section: .bss
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: foo
+# CHECK-NEXT:     Value: 0x138
+# CHECK-NEXT:     Size: 4
+# CHECK-NEXT:     Binding: Global
+# CHECK-NEXT:     Type: Object
+# CHECK-NEXT:     Other: 0
+# CHECK-NEXT:     Section: .bss
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: pfoo
+# CHECK-NEXT:     Value: 0x138
+# CHECK-NEXT:     Size: 0
+# CHECK-NEXT:     Binding: Global
+# CHECK-NEXT:     Type: None
+# CHECK-NEXT:     Other: 0
+# CHECK-NEXT:     Section: .bss
+# CHECK-NEXT:   }
+# CHECK-NEXT:   Symbol {
+# CHECK-NEXT:     Name: pbar
+# CHECK-NEXT:     Value: 0x134
+# CHECK-NEXT:     Size: 0
+# CHECK-NEXT:     Binding: Global
+# CHECK-NEXT:     Type: None
+# CHECK-NEXT:     Other: 0
+# CHECK-NEXT:     Section: .bss
+# CHECK-NEXT:   }
+# CHECK-NEXT: ]
+
+.comm	foo,4,4
+.comm	bar,4,4
+movl	$1, foo(%rip)
+movl	$2, bar(%rip)