Vendor import of llvm release_39 branch r288513:

https://llvm.org/svn/llvm-project/llvm/branches/release_39@288513
Dimitry Andric 2016-12-02 19:20:10 +00:00
parent 6449741f4c
commit 2cf3bd4601
17 changed files with 7403 additions and 37 deletions


@@ -20,11 +20,11 @@
#include <ciso646> // So we can check the C++ standard lib macros.
#include <functional>
// We use std::call_once on all Unix platforms except for NetBSD with
// libstdc++. That platform has a bug they are working to fix, and they'll
// remove the NetBSD checks once fixed.
#if defined(LLVM_ON_UNIX) && \
!(defined(__NetBSD__) && !defined(_LIBCPP_VERSION)) && !defined(__ppc__)
// std::call_once from libc++ is used on all Unix platforms. Other
// implementations like libstdc++ are known to have problems on NetBSD,
// OpenBSD and PowerPC.
#if defined(LLVM_ON_UNIX) && (defined(_LIBCPP_VERSION) || \
!(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
#define LLVM_THREADING_USE_STD_CALL_ONCE 1
#else
#define LLVM_THREADING_USE_STD_CALL_ONCE 0
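
As a point of reference, a minimal standalone sketch (plain C++11, not part of this patch) of how an LLVM_THREADING_USE_STD_CALL_ONCE-style switch is typically consumed: with the macro set to 1, one-time initialization funnels through std::call_once; the atomic fallback below is only an illustrative stand-in for whatever the non-call_once path actually does.

#include <atomic>
#include <mutex>

// Assumes the macro from the hunk above; default to the std::call_once path.
#ifndef LLVM_THREADING_USE_STD_CALL_ONCE
#define LLVM_THREADING_USE_STD_CALL_ONCE 1
#endif

#if LLVM_THREADING_USE_STD_CALL_ONCE
static std::once_flag InitFlag;
#else
static std::atomic<int> InitState{0}; // 0 = not started, 1 = running, 2 = done
#endif

static void initializeOnce() {
  // One-time setup would go here.
}

void ensureInitialized() {
#if LLVM_THREADING_USE_STD_CALL_ONCE
  std::call_once(InitFlag, initializeOnce);
#else
  int Expected = 0;
  if (InitState.compare_exchange_strong(Expected, 1)) {
    initializeOnce();
    InitState.store(2, std::memory_order_release);
  } else {
    // Another thread won the race; spin until it finishes.
    while (InitState.load(std::memory_order_acquire) != 2) {
    }
  }
#endif
}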


@@ -2185,24 +2185,29 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
// options. This is a trivially-generalized version of the code from
// Hacker's Delight (itself derived from Knuth's Algorithm M from section
// 4.3.1).
SDValue Mask =
DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
NVT.getSizeInBits() >> 1), dl, NVT);
unsigned Bits = NVT.getSizeInBits();
unsigned HalfBits = Bits >> 1;
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
NVT);
SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
SDValue Shift =
DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) {
// The type from TLI is too small to fit the shift amount we want.
// Override it with i32. The shift will have to be legalized.
ShiftAmtTy = MVT::i32;
}
SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy);
SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TL);
DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
@@ -2211,14 +2216,14 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
DAG.getNode(ISD::MUL, dl, NVT, LL, RL),
DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
Lo = DAG.getNode(ISD::ADD, dl, NVT, TH,
Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
DAG.getNode(ISD::ADD, dl, NVT,
DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
return;
}
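
For reference, a minimal standalone sketch (plain C++, not part of this patch) of the half-word multiplication scheme the comment above cites from Hacker's Delight and Knuth's Algorithm M, shown as a 64x64 -> 128-bit multiply built from 32-bit halves. The legalizer builds the same dataflow out of DAG nodes on NVT-sized halves, and widens the shift amount (HalfBits) to i32 when the target's default shift-amount type cannot hold it.

#include <cstdint>

// Multiply two 64-bit values into a 128-bit Hi:Lo result using only 64-bit
// operations on 32-bit halves. The T/TL/TH, U/UL/UH, V/VH names loosely
// mirror the DAG values built above.
void mul64x64to128(uint64_t L, uint64_t R, uint64_t &Hi, uint64_t &Lo) {
  const unsigned HalfBits = 32;
  const uint64_t Mask = 0xFFFFFFFFull;   // low HalfBits bits set

  uint64_t LL = L & Mask, LH = L >> HalfBits;
  uint64_t RL = R & Mask, RH = R >> HalfBits;

  uint64_t T  = LL * RL;                 // low x low partial product
  uint64_t TL = T & Mask;
  uint64_t TH = T >> HalfBits;

  uint64_t U  = LH * RL + TH;            // first cross product plus carry
  uint64_t UL = U & Mask;
  uint64_t UH = U >> HalfBits;

  uint64_t V  = LL * RH + UL;            // second cross product plus carry
  uint64_t VH = V >> HalfBits;

  Lo = TL + (V << HalfBits);             // low half of the full product
  Hi = LH * RH + UH + VH;                // high half of the full product
}

As a quick sanity check, multiplying 0xFFFFFFFFFFFFFFFF by itself with this routine yields Hi = 0xFFFFFFFFFFFFFFFE and Lo = 1, i.e. (2^64 - 1)^2.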


@@ -2203,7 +2203,8 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
}
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
// Legalize VOP2
if (isVOP2(MI) || isVOPC(MI)) {
@@ -2321,8 +2322,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
// Legalize MIMG
if (isMIMG(MI)) {
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
// scratch memory access. In both cases, the legalization never involves
// conversion to the addr64 form.
if (isMIMG(MI) ||
(AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
(isMUBUF(MI) || isMTBUF(MI)))) {
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
@@ -2337,9 +2344,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
// Legalize MUBUF* instructions
// Legalize MUBUF* instructions by converting to addr64 form.
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
// may need to legalize them as above. This especially applies to the
// buffer_load_format_* variants and variants with idxen (or bothen).
int SRsrcIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
if (SRsrcIdx != -1) {


@@ -2029,6 +2029,7 @@ def SI_RETURN : PseudoInstSI <
let hasSideEffects = 1;
let SALU = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
let Uses = [EXEC], Defs = [EXEC, VCC, M0],


@@ -219,13 +219,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
markInstruction(MI, Flags, Worklist);
GlobalFlags |= Flags;
}
if (WQMOutputs && MBB.succ_empty()) {
// This is a prolog shader. Make sure we go back to exact mode at the end.
Blocks[&MBB].OutNeeds = StateExact;
Worklist.push_back(&MBB);
GlobalFlags |= StateExact;
}
}
return GlobalFlags;


@@ -634,7 +634,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
}
if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
!isa<GEPOperator>(V) && !isa<PHINode>(V))
!isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
// We've found some value that we can't explore which is different from
// the base. Therefore we can't do this transformation.
return false;


@@ -579,6 +579,13 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
UndefValue::get(T), NewLoad, 0, Name));
}
// Bail out if the array is too large. Ideally we would like to optimize
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily, maybe needs a little bit of
// tuning.
if (NumElements > 1024)
return nullptr;
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(ET);
auto Align = LI.getAlignment();
@@ -1081,6 +1088,13 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
return true;
}
// Bail out if the array is too large. Ideally we would like to optimize
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily, maybe needs a little bit of
// tuning.
if (NumElements > 1024)
return false;
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(AT->getElementType());
auto Align = SI.getAlignment();
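
As a rough illustration of the compile-time concern behind the new 1024-element cap (a standalone sketch with made-up names, not InstCombine code): unpacking a load or store of an [N x T] aggregate emits on the order of a GEP, a memory operation, and an insertvalue/extractvalue per element, so IR size and work grow linearly with N, and the threshold simply bounds that growth.

#include <cstdint>
#include <cstdio>

// Hypothetical cost model: roughly three instructions per element once the
// aggregate access is scalarized (gep + load/store + insert/extractvalue).
constexpr std::uint64_t UnpackThreshold = 1024; // same value the patch uses

std::uint64_t estimateUnpackedInstrs(std::uint64_t NumElements) {
  return 3 * NumElements;
}

bool worthUnpacking(std::uint64_t NumElements) {
  // Mirrors the bail-out above: past the threshold, keep the aggregate op.
  return NumElements <= UnpackThreshold;
}

int main() {
  for (std::uint64_t N : {1, 2, 1024, 2000})
    std::printf("N=%llu -> ~%llu instructions, unpack=%s\n",
                (unsigned long long)N,
                (unsigned long long)estimateUnpackedInstrs(N),
                worthUnpacking(N) ? "yes" : "no");
  return 0;
}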


@@ -2024,14 +2024,20 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// Move all 'aggressive' instructions, which are defined in the
// conditional parts of the if's up to the dominating block.
if (IfBlock1)
if (IfBlock1) {
for (auto &I : *IfBlock1)
I.dropUnknownNonDebugMetadata();
DomBlock->getInstList().splice(InsertPt->getIterator(),
IfBlock1->getInstList(), IfBlock1->begin(),
IfBlock1->getTerminator()->getIterator());
if (IfBlock2)
}
if (IfBlock2) {
for (auto &I : *IfBlock2)
I.dropUnknownNonDebugMetadata();
DomBlock->getInstList().splice(InsertPt->getIterator(),
IfBlock2->getInstList(), IfBlock2->begin(),
IfBlock2->getTerminator()->getIterator());
}
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
// Change the PHI node into a select instruction.


@@ -0,0 +1,49 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
; Test that buffer_load_format with VGPR resource descriptor is properly
; legalized.
; CHECK-LABEL: {{^}}test_none:
; CHECK: buffer_load_format_x v0, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_vs float @test_none(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i1 0, i1 0)
ret float %tmp7
}
; CHECK-LABEL: {{^}}test_idxen:
; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen{{$}}
define amdgpu_vs float @test_idxen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i1 0, i1 0)
ret float %tmp7
}
; CHECK-LABEL: {{^}}test_offen:
; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_vs float @test_offen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 undef, i1 0, i1 0)
ret float %tmp7
}
; CHECK-LABEL: {{^}}test_both:
; CHECK: buffer_load_format_x v0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen{{$}}
define amdgpu_vs float @test_both(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i1 0, i1 0)
ret float %tmp7
}
declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) nounwind readonly
attributes #0 = { nounwind readnone }


@@ -17,17 +17,18 @@ main_body:
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: _load_dword v0,
define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
%c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%c.2 = bitcast <4 x float> %c.1 to <4 x i32>
%c.3 = extractelement <4 x i32> %c.2, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
%data = load float, float addrspace(1)* %gep
ret float %data
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
ret void
}
; ... but disabled for stores (and, in this simple case, not re-enabled).
@@ -414,6 +415,46 @@ entry:
ret void
}
; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.i = bitcast <4 x float> %tex to <4 x i32>
%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %dtex
}
; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.i = bitcast <4 x float> %tex to <4 x i32>
%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%cc = icmp sgt i32 %c, 0
br i1 %cc, label %if, label %else
if:
store volatile <4 x float> %dtex, <4 x float>* undef
unreachable
else:
ret <4 x float> %dtex
}
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1

File diff suppressed because it is too large.


@@ -15,12 +15,17 @@ entry:
; There is a lot of inter-register motion, and so matching the instruction
; sequence will be fragile. There should be 6 underlying multiplications.
; CHECK: imulq
; CHECK: mulq
; CHECK: imulq
; CHECK: imulq
; CHECK: mulq
; CHECK: imulq
; CHECK: imulq
; CHECK: imulq
; CHECK: mulq
; CHECK: mulq
; CHECK: mulq
; CHECK: mulq
; CHECK-NOT: imulq
; CHECK-NOT: mulq
; CHECK: retq
attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }

test/CodeGen/X86/mul-i512.ll (new file, 1238 additions)

File diff suppressed because it is too large.


@@ -1,6 +1,6 @@
; RUN: llvm-as -o %t.dst.bc %s
; RUN: llvm-as -o %t.src.bc %S/Inputs/type-mapping-src.ll
; RUN: llvm-lto %t.dst.bc %t.src.bc -o=/dev/null
; RUN: llvm-lto %t.dst.bc %t.src.bc -o=%t.lto.bc
target triple = "x86_64-pc-windows-msvc18.0.0"


@@ -167,4 +167,24 @@ lpad:
; CHECK: ret i32* %[[PTR]]
}
@pr30402 = constant i64 3
define i1 @test7() {
entry:
br label %bb7
bb7: ; preds = %bb10, %entry-block
%phi = phi i64* [ @pr30402, %entry ], [ getelementptr inbounds (i64, i64* @pr30402, i32 1), %bb7 ]
%cmp = icmp eq i64* %phi, getelementptr inbounds (i64, i64* @pr30402, i32 1)
br i1 %cmp, label %bb10, label %bb7
bb10:
ret i1 %cmp
}
; CHECK-LABEL: @test7(
; CHECK: %[[phi:.*]] = phi i64* [ @pr30402, %entry ], [ getelementptr inbounds (i64, i64* @pr30402, i32 1), %bb7 ]
; CHECK: %[[cmp:.*]] = icmp eq i64* %[[phi]], getelementptr inbounds (i64, i64* @pr30402, i32 1)
; CHECK: ret i1 %[[cmp]]
declare i32 @__gxx_personality_v0(...)


@@ -49,6 +49,15 @@ define void @storeArrayOfA([1 x %A]* %aa.ptr) {
ret void
}
define void @storeLargeArrayOfA([2000 x %A]* %aa.ptr) {
; CHECK-LABEL: storeLargeArrayOfA
; CHECK-NEXT: store [2000 x %A]
; CHECK-NEXT: ret void
%i1 = insertvalue [2000 x %A] undef, %A { %A__vtbl* @A__vtblZ }, 1
store [2000 x %A] %i1, [2000 x %A]* %aa.ptr, align 8
ret void
}
define void @storeStructOfArrayOfA({ [1 x %A] }* %saa.ptr) {
; CHECK-LABEL: storeStructOfArrayOfA
; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { [1 x %A] }, { [1 x %A] }* %saa.ptr, i64 0, i32 0, i64 0, i32 0
@@ -179,6 +188,14 @@ define [2 x %B] @loadArrayOfB([2 x %B]* %ab.ptr) {
ret [2 x %B] %1
}
define [2000 x %B] @loadLargeArrayOfB([2000 x %B]* %ab.ptr) {
; CHECK-LABEL: loadLargeArrayOfB
; CHECK-NEXT: load [2000 x %B], [2000 x %B]* %ab.ptr, align 8
; CHECK-NEXT: ret [2000 x %B]
%1 = load [2000 x %B], [2000 x %B]* %ab.ptr, align 8
ret [2000 x %B] %1
}
%struct.S = type <{ i8, %struct.T }>
%struct.T = type { i32, i32 }


@@ -0,0 +1,31 @@
; RUN: opt -S -simplifycfg < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@GV = external constant i64*
define i64* @test1(i1 %cond, i8* %P) {
entry:
br i1 %cond, label %if, label %then
then:
%bc = bitcast i8* %P to i64*
br label %join
if:
%load = load i64*, i64** @GV, align 8, !dereferenceable !0
br label %join
join:
%phi = phi i64* [ %bc, %then ], [ %load, %if ]
ret i64* %phi
}
; CHECK-LABEL: define i64* @test1(
; CHECK: %[[bc:.*]] = bitcast i8* %P to i64*
; CHECK: %[[load:.*]] = load i64*, i64** @GV, align 8{{$}}
; CHECK: %[[phi:.*]] = select i1 %cond, i64* %[[load]], i64* %[[bc]]
; CHECK: ret i64* %[[phi]]
!0 = !{i64 8}