Vendor import of llvm release_39 branch r288513:

https://llvm.org/svn/llvm-project/llvm/branches/release_39@288513
Dimitry Andric 2016-12-02 19:20:10 +00:00
parent 6449741f4c
commit 2cf3bd4601
17 changed files with 7403 additions and 37 deletions


@@ -20,11 +20,11 @@
#include <ciso646> // So we can check the C++ standard lib macros.
#include <functional>
// We use std::call_once on all Unix platforms except for NetBSD with
// libstdc++. That platform has a bug they are working to fix, and they'll
// remove the NetBSD checks once fixed.
#if defined(LLVM_ON_UNIX) && \
!(defined(__NetBSD__) && !defined(_LIBCPP_VERSION)) && !defined(__ppc__)
// std::call_once from libc++ is used on all Unix platforms. Other
// implementations like libstdc++ are known to have problems on NetBSD,
// OpenBSD and PowerPC.
#if defined(LLVM_ON_UNIX) && (defined(_LIBCPP_VERSION) || \
!(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
#define LLVM_THREADING_USE_STD_CALL_ONCE 1
#else
#define LLVM_THREADING_USE_STD_CALL_ONCE 0
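
As a point of reference, a minimal standalone sketch (plain C++11, not part of this patch) of how an LLVM_THREADING_USE_STD_CALL_ONCE-style switch is typically consumed: with the macro set to 1, one-time initialization funnels through std::call_once; the atomic fallback below is only an illustrative stand-in for whatever the non-call_once path actually does.

#include <atomic>
#include <mutex>

// Assumes the macro from the hunk above; default to the std::call_once path.
#ifndef LLVM_THREADING_USE_STD_CALL_ONCE
#define LLVM_THREADING_USE_STD_CALL_ONCE 1
#endif

#if LLVM_THREADING_USE_STD_CALL_ONCE
static std::once_flag InitFlag;
#else
static std::atomic<int> InitState{0}; // 0 = not started, 1 = running, 2 = done
#endif

static void initializeOnce() {
  // One-time setup would go here.
}

void ensureInitialized() {
#if LLVM_THREADING_USE_STD_CALL_ONCE
  std::call_once(InitFlag, initializeOnce);
#else
  int Expected = 0;
  if (InitState.compare_exchange_strong(Expected, 1)) {
    initializeOnce();
    InitState.store(2, std::memory_order_release);
  } else {
    // Another thread won the race; spin until it finishes.
    while (InitState.load(std::memory_order_acquire) != 2) {
    }
  }
#endif
}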


@@ -2185,24 +2185,29 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
// options. This is a trivially-generalized version of the code from
// Hacker's Delight (itself derived from Knuth's Algorithm M from section
// 4.3.1).
SDValue Mask =
DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
NVT.getSizeInBits() >> 1), dl, NVT);
unsigned Bits = NVT.getSizeInBits();
unsigned HalfBits = Bits >> 1;
SDValue Mask = DAG.getConstant(APInt::getLowBitsSet(Bits, HalfBits), dl,
NVT);
SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
SDValue Shift =
DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
if (APInt::getMaxValue(ShiftAmtTy.getSizeInBits()).ult(HalfBits)) {
// The type from TLI is too small to fit the shift amount we want.
// Override it with i32. The shift will have to be legalized.
ShiftAmtTy = MVT::i32;
}
SDValue Shift = DAG.getConstant(HalfBits, dl, ShiftAmtTy);
SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TL);
DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TH);
SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
@@ -2211,14 +2216,14 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
DAG.getNode(ISD::MUL, dl, NVT, LL, RL),
DAG.getNode(ISD::MUL, dl, NVT, LLH, RLH),
DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
Lo = DAG.getNode(ISD::ADD, dl, NVT, TH,
Lo = DAG.getNode(ISD::ADD, dl, NVT, TL,
DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
DAG.getNode(ISD::ADD, dl, NVT,
DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
return;
}
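
For reference, a minimal standalone sketch (plain C++, not part of this patch) of the half-word multiplication scheme the comment above cites from Hacker's Delight and Knuth's Algorithm M, shown as a 64x64 -> 128-bit multiply built from 32-bit halves. The legalizer builds the same dataflow out of DAG nodes on NVT-sized halves, and widens the shift amount (HalfBits) to i32 when the target's default shift-amount type cannot hold it.

#include <cstdint>

// Multiply two 64-bit values into a 128-bit Hi:Lo result using only 64-bit
// operations on 32-bit halves. The T/TL/TH, U/UL/UH, V/VH names loosely
// mirror the DAG values built above.
void mul64x64to128(uint64_t L, uint64_t R, uint64_t &Hi, uint64_t &Lo) {
  const unsigned HalfBits = 32;
  const uint64_t Mask = 0xFFFFFFFFull;   // low HalfBits bits set

  uint64_t LL = L & Mask, LH = L >> HalfBits;
  uint64_t RL = R & Mask, RH = R >> HalfBits;

  uint64_t T  = LL * RL;                 // low x low partial product
  uint64_t TL = T & Mask;
  uint64_t TH = T >> HalfBits;

  uint64_t U  = LH * RL + TH;            // first cross product plus carry
  uint64_t UL = U & Mask;
  uint64_t UH = U >> HalfBits;

  uint64_t V  = LL * RH + UL;            // second cross product plus carry
  uint64_t VH = V >> HalfBits;

  Lo = TL + (V << HalfBits);             // low half of the full product
  Hi = LH * RH + UH + VH;                // high half of the full product
}

As a quick sanity check, multiplying 0xFFFFFFFFFFFFFFFF by itself with this routine yields Hi = 0xFFFFFFFFFFFFFFFE and Lo = 1, i.e. (2^64 - 1)^2.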


@@ -2203,7 +2203,8 @@ void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
}
void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
MachineFunction &MF = *MI.getParent()->getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
// Legalize VOP2
if (isVOP2(MI) || isVOPC(MI)) {
@@ -2321,8 +2322,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
// Legalize MIMG
if (isMIMG(MI)) {
// Legalize MIMG and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
// scratch memory access. In both cases, the legalization never involves
// conversion to the addr64 form.
if (isMIMG(MI) ||
(AMDGPU::isShader(MF.getFunction()->getCallingConv()) &&
(isMUBUF(MI) || isMTBUF(MI)))) {
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) {
unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI);
@@ -2337,9 +2344,10 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
return;
}
// Legalize MUBUF* instructions
// Legalize MUBUF* instructions by converting to addr64 form.
// FIXME: If we start using the non-addr64 instructions for compute, we
// may need to legalize them here.
// may need to legalize them as above. This especially applies to the
// buffer_load_format_* variants and variants with idxen (or bothen).
int SRsrcIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
if (SRsrcIdx != -1) {


@@ -2029,6 +2029,7 @@ def SI_RETURN : PseudoInstSI <
let hasSideEffects = 1;
let SALU = 1;
let hasNoSchedulingInfo = 1;
let DisableWQM = 1;
}
let Uses = [EXEC], Defs = [EXEC, VCC, M0],


@@ -219,13 +219,6 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
markInstruction(MI, Flags, Worklist);
GlobalFlags |= Flags;
}
if (WQMOutputs && MBB.succ_empty()) {
// This is a prolog shader. Make sure we go back to exact mode at the end.
Blocks[&MBB].OutNeeds = StateExact;
Worklist.push_back(&MBB);
GlobalFlags |= StateExact;
}
}
return GlobalFlags;


@@ -634,7 +634,7 @@ static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
}
if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
!isa<GEPOperator>(V) && !isa<PHINode>(V))
!isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
// We've found some value that we can't explore which is different from
// the base. Therefore we can't do this transformation.
return false;


@@ -579,6 +579,13 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
UndefValue::get(T), NewLoad, 0, Name));
}
// Bail out if the array is too large. Ideally we would like to optimize
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily, maybe needs a little bit of
// tuning.
if (NumElements > 1024)
return nullptr;
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(ET);
auto Align = LI.getAlignment();
@@ -1081,6 +1088,13 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
return true;
}
// Bail out if the array is too large. Ideally we would like to optimize
// arrays of arbitrary size but this has a terrible impact on compile time.
// The threshold here is chosen arbitrarily, maybe needs a little bit of
// tuning.
if (NumElements > 1024)
return false;
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(AT->getElementType());
auto Align = SI.getAlignment();
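
As a rough illustration of the compile-time concern behind the new 1024-element cap (a standalone sketch with made-up names, not InstCombine code): unpacking a load or store of an [N x T] aggregate emits on the order of a GEP, a memory operation, and an insertvalue/extractvalue per element, so IR size and work grow linearly with N, and the threshold simply bounds that growth.

#include <cstdint>
#include <cstdio>

// Hypothetical cost model: roughly three instructions per element once the
// aggregate access is scalarized (gep + load/store + insert/extractvalue).
constexpr std::uint64_t UnpackThreshold = 1024; // same value the patch uses

std::uint64_t estimateUnpackedInstrs(std::uint64_t NumElements) {
  return 3 * NumElements;
}

bool worthUnpacking(std::uint64_t NumElements) {
  // Mirrors the bail-out above: past the threshold, keep the aggregate op.
  return NumElements <= UnpackThreshold;
}

int main() {
  for (std::uint64_t N : {1, 2, 1024, 2000})
    std::printf("N=%llu -> ~%llu instructions, unpack=%s\n",
                (unsigned long long)N,
                (unsigned long long)estimateUnpackedInstrs(N),
                worthUnpacking(N) ? "yes" : "no");
  return 0;
}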


@@ -2024,14 +2024,20 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// Move all 'aggressive' instructions, which are defined in the
// conditional parts of the if's up to the dominating block.
if (IfBlock1)
if (IfBlock1) {
for (auto &I : *IfBlock1)
I.dropUnknownNonDebugMetadata();
DomBlock->getInstList().splice(InsertPt->getIterator(),
IfBlock1->getInstList(), IfBlock1->begin(),
IfBlock1->getTerminator()->getIterator());
if (IfBlock2)
}
if (IfBlock2) {
for (auto &I : *IfBlock2)
I.dropUnknownNonDebugMetadata();
DomBlock->getInstList().splice(InsertPt->getIterator(),
IfBlock2->getInstList(), IfBlock2->begin(),
IfBlock2->getTerminator()->getIterator());
}
while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
// Change the PHI node into a select instruction.


@@ -0,0 +1,49 @@
;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -check-prefix=CHECK
; Test that buffer_load_format with VGPR resource descriptor is properly
; legalized.
; CHECK-LABEL: {{^}}test_none:
; CHECK: buffer_load_format_x v0, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_vs float @test_none(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 0, i1 0, i1 0)
ret float %tmp7
}
; CHECK-LABEL: {{^}}test_idxen:
; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen{{$}}
define amdgpu_vs float @test_idxen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 0, i1 0, i1 0)
ret float %tmp7
}
; CHECK-LABEL: {{^}}test_offen:
; CHECK: buffer_load_format_x v0, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen{{$}}
define amdgpu_vs float @test_offen(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 0, i32 undef, i1 0, i1 0)
ret float %tmp7
}
; CHECK-LABEL: {{^}}test_both:
; CHECK: buffer_load_format_x v0, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen offen{{$}}
define amdgpu_vs float @test_both(<4 x i32> addrspace(2)* inreg %base, i32 %i) {
main_body:
%ptr = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %base, i32 %i
%tmp2 = load <4 x i32>, <4 x i32> addrspace(2)* %ptr, align 32
%tmp7 = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %tmp2, i32 undef, i32 undef, i1 0, i1 0)
ret float %tmp7
}
declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) nounwind readonly
attributes #0 = { nounwind readnone }


@@ -17,17 +17,18 @@ main_body:
;CHECK-LABEL: {{^}}test2:
;CHECK-NEXT: ; %main_body
;CHECK-NEXT: s_wqm_b64 exec, exec
;CHECK: image_sample
;CHECK-NOT: exec
;CHECK: _load_dword v0,
define amdgpu_ps float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
main_body:
%c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%c.2 = bitcast <4 x float> %c.1 to <4 x i32>
%c.3 = extractelement <4 x i32> %c.2, i32 0
%gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
%data = load float, float addrspace(1)* %gep
ret float %data
call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
ret void
}
; ... but disabled for stores (and, in this simple case, not re-enabled).
@@ -414,6 +415,46 @@ entry:
ret void
}
; Must return to exact at the end of a non-void returning shader,
; otherwise the EXEC mask exported by the epilog will be wrong. This is true
; even if the shader has no kills, because a kill could have happened in a
; previous shader fragment.
;
; CHECK-LABEL: {{^}}test_nonvoid_return:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.i = bitcast <4 x float> %tex to <4 x i32>
%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
ret <4 x float> %dtex
}
; CHECK-LABEL: {{^}}test_nonvoid_return_unreachable:
; CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec
; CHECK: s_wqm_b64 exec, exec
;
; CHECK: s_and_b64 exec, exec, [[LIVE]]
; CHECK-NOT: exec
define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
entry:
%tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%tex.i = bitcast <4 x float> %tex to <4 x i32>
%dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
%cc = icmp sgt i32 %c, 0
br i1 %cc, label %if, label %else
if:
store volatile <4 x float> %dtex, <4 x float>* undef
unreachable
else:
ret <4 x float> %dtex
}
declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1

File diff suppressed because it is too large.


@@ -15,12 +15,17 @@ entry:
; There is a lot of inter-register motion, and so matching the instruction
; sequence will be fragile. There should be 6 underlying multiplications.
; CHECK: imulq
; CHECK: mulq
; CHECK: imulq
; CHECK: imulq
; CHECK: mulq
; CHECK: imulq
; CHECK: imulq
; CHECK: imulq
; CHECK: mulq
; CHECK: mulq
; CHECK: mulq
; CHECK: mulq
; CHECK-NOT: imulq
; CHECK-NOT: mulq
; CHECK: retq
attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }

test/CodeGen/X86/mul-i512.ll (new file, 1238 additions)

File diff suppressed because it is too large.


@@ -1,6 +1,6 @@
; RUN: llvm-as -o %t.dst.bc %s
; RUN: llvm-as -o %t.src.bc %S/Inputs/type-mapping-src.ll
; RUN: llvm-lto %t.dst.bc %t.src.bc -o=/dev/null
; RUN: llvm-lto %t.dst.bc %t.src.bc -o=%t.lto.bc
target triple = "x86_64-pc-windows-msvc18.0.0"


@@ -167,4 +167,24 @@ lpad:
; CHECK: ret i32* %[[PTR]]
}
@pr30402 = constant i64 3
define i1 @test7() {
entry:
br label %bb7
bb7: ; preds = %bb10, %entry-block
%phi = phi i64* [ @pr30402, %entry ], [ getelementptr inbounds (i64, i64* @pr30402, i32 1), %bb7 ]
%cmp = icmp eq i64* %phi, getelementptr inbounds (i64, i64* @pr30402, i32 1)
br i1 %cmp, label %bb10, label %bb7
bb10:
ret i1 %cmp
}
; CHECK-LABEL: @test7(
; CHECK: %[[phi:.*]] = phi i64* [ @pr30402, %entry ], [ getelementptr inbounds (i64, i64* @pr30402, i32 1), %bb7 ]
; CHECK: %[[cmp:.*]] = icmp eq i64* %[[phi]], getelementptr inbounds (i64, i64* @pr30402, i32 1)
; CHECK: ret i1 %[[cmp]]
declare i32 @__gxx_personality_v0(...)


@@ -49,6 +49,15 @@ define void @storeArrayOfA([1 x %A]* %aa.ptr) {
ret void
}
define void @storeLargeArrayOfA([2000 x %A]* %aa.ptr) {
; CHECK-LABEL: storeLargeArrayOfA
; CHECK-NEXT: store [2000 x %A]
; CHECK-NEXT: ret void
%i1 = insertvalue [2000 x %A] undef, %A { %A__vtbl* @A__vtblZ }, 1
store [2000 x %A] %i1, [2000 x %A]* %aa.ptr, align 8
ret void
}
define void @storeStructOfArrayOfA({ [1 x %A] }* %saa.ptr) {
; CHECK-LABEL: storeStructOfArrayOfA
; CHECK-NEXT: [[GEP:%[a-z0-9\.]+]] = getelementptr inbounds { [1 x %A] }, { [1 x %A] }* %saa.ptr, i64 0, i32 0, i64 0, i32 0
@@ -179,6 +188,14 @@ define [2 x %B] @loadArrayOfB([2 x %B]* %ab.ptr) {
ret [2 x %B] %1
}
define [2000 x %B] @loadLargeArrayOfB([2000 x %B]* %ab.ptr) {
; CHECK-LABEL: loadLargeArrayOfB
; CHECK-NEXT: load [2000 x %B], [2000 x %B]* %ab.ptr, align 8
; CHECK-NEXT: ret [2000 x %B]
%1 = load [2000 x %B], [2000 x %B]* %ab.ptr, align 8
ret [2000 x %B] %1
}
%struct.S = type <{ i8, %struct.T }>
%struct.T = type { i32, i32 }


@@ -0,0 +1,31 @@
; RUN: opt -S -simplifycfg < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@GV = external constant i64*
define i64* @test1(i1 %cond, i8* %P) {
entry:
br i1 %cond, label %if, label %then
then:
%bc = bitcast i8* %P to i64*
br label %join
if:
%load = load i64*, i64** @GV, align 8, !dereferenceable !0
br label %join
join:
%phi = phi i64* [ %bc, %then ], [ %load, %if ]
ret i64* %phi
}
; CHECK-LABEL: define i64* @test1(
; CHECK: %[[bc:.*]] = bitcast i8* %P to i64*
; CHECK: %[[load:.*]] = load i64*, i64** @GV, align 8{{$}}
; CHECK: %[[phi:.*]] = select i1 %cond, i64* %[[load]], i64* %[[bc]]
; CHECK: ret i64* %[[phi]]
!0 = !{i64 8}