freebsd-dev/contrib/llvm/lib/Target/R600/SILowerControlFlow.cpp
Dimitry Andric ef6fa9e26d Upgrade our copy of clang and llvm to 3.6.1 release.
This release contains the following cherry-picked revisions from
upstream trunk:

  226124 226151 226164 226165 226166 226407 226408 226409 226652
  226905 226983 227084 227087 227089 227208 227209 227210 227211
  227212 227213 227214 227269 227430 227482 227503 227519 227574
  227822 227986 227987 227988 227989 227990 228037 228038 228039
  228040 228188 228189 228190 228273 228372 228373 228374 228403
  228765 228848 228918 229223 229225 229226 229227 229228 229230
  229234 229235 229236 229238 229239 229413 229507 229680 229750
  229751 229752 229911 230146 230147 230235 230253 230255 230469
  230500 230564 230603 230657 230742 230748 230956 231219 231237
  231245 231259 231280 231451 231563 231601 231658 231659 231662
  231984 231986 232046 232085 232142 232176 232179 232189 232382
  232386 232389 232425 232438 232443 232675 232786 232797 232943
  232957 233075 233080 233351 233353 233409 233410 233508 233584
  233819 233904 234629 234636 234891 234975 234977 235524 235641
  235662 235931 236099 236306 236307

Please note that from 3.5.0 onwards, clang and llvm require C++11
support to build; see UPDATING for more information.
2015-05-25 13:43:03 +00:00


//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack. Each Scalar ALU controls the operations of 64 Vector
/// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single vector ALU). Typically, for predicates, a vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit
/// per Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
/// %SGPR0 = SI_IF %VCC
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
/// %SGPR0 = SI_ELSE %SGPR0
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
/// SI_END_CF %SGPR0
///
/// becomes:
///
/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
///
/// label0:
/// %SGPR0 = S_OR_SAVEEXEC_B64 %SGPR0 // Restore the exec mask for the Then block
/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label1            // Use our branch optimization
///                                   // instruction again.
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
/// label1:
/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Constants.h"

using namespace llvm;

namespace {

class SILowerControlFlowPass : public MachineFunctionPass {
private:
  static const unsigned SkipThreshold = 12;

  static char ID;
  const SIRegisterInfo *TRI;
  const SIInstrInfo *TII;

  bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);

  void Skip(MachineInstr &From, MachineOperand &To);
  void SkipIfDead(MachineInstr &MI);

  void If(MachineInstr &MI);
  void Else(MachineInstr &MI);
  void Break(MachineInstr &MI);
  void IfBreak(MachineInstr &MI);
  void ElseBreak(MachineInstr &MI);
  void Loop(MachineInstr &MI);
  void EndCf(MachineInstr &MI);

  void Kill(MachineInstr &MI);
  void Branch(MachineInstr &MI);

  void LoadM0(MachineInstr &MI, MachineInstr *MovRel, int Offset = 0);
  void computeIndirectRegAndOffset(unsigned VecReg, unsigned &Reg,
                                   int &Offset);
  void IndirectSrc(MachineInstr &MI);
  void IndirectDst(MachineInstr &MI);

public:
  SILowerControlFlowPass(TargetMachine &tm) :
    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  const char *getPassName() const override {
    return "SI Lower control flow instructions";
  }
};

} // End anonymous namespace
char SILowerControlFlowPass::ID = 0;

FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
  return new SILowerControlFlowPass(tm);
}
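
// Estimate whether it is worth inserting an S_CBRANCH_EXECZ skip over the
// code between \p From and \p To. Blocks are walked through their first
// successor and top-level instructions are counted (a bundle counts once);
// reaching SkipThreshold means the skip branch is considered profitable.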
bool SILowerControlFlowPass::shouldSkip(MachineBasicBlock *From,
                                        MachineBasicBlock *To) {
  unsigned NumInstr = 0;

  for (MachineBasicBlock *MBB = From; MBB != To && !MBB->succ_empty();
       MBB = *MBB->succ_begin()) {

    for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
         NumInstr < SkipThreshold && I != E; ++I) {

      if (I->isBundle() || !I->isBundled())
        if (++NumInstr >= SkipThreshold)
          return true;
    }
  }

  return false;
}
void SILowerControlFlowPass::Skip(MachineInstr &From, MachineOperand &To) {
  if (!shouldSkip(*From.getParent()->succ_begin(), To.getMBB()))
    return;

  DebugLoc DL = From.getDebugLoc();
  BuildMI(*From.getParent(), &From, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addOperand(To)
    .addReg(AMDGPU::EXEC);
}
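
// If a pixel shader has killed all of its lanes, the remainder of the
// program cannot produce any visible result, so it is skipped entirely.
// A sketch of the sequence emitted below (mirroring the BuildMI calls;
// not authoritative assembly syntax):
//
//   S_CBRANCH_EXECNZ 3                    ; EXEC != 0: skip the next two
//   EXP 0, 0x09, 0, 1, 1, v0, v0, v0, v0  ; export to the NULL target, done=1
//   S_ENDPGM                              ; ... and terminate the wavefront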
void SILowerControlFlowPass::SkipIfDead(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  if (MBB.getParent()->getInfo<SIMachineFunctionInfo>()->getShaderType() !=
      ShaderType::PIXEL ||
      !shouldSkip(&MBB, &MBB.getParent()->back()))
    return;

  MachineBasicBlock::iterator Insert = &MI;
  ++Insert;

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addImm(3)
    .addReg(AMDGPU::EXEC);

  // Exec mask is zero: Export to NULL target...
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::EXP))
    .addImm(0)
    .addImm(0x09) // V_008DFC_SQ_EXP_NULL
    .addImm(0)
    .addImm(1)
    .addImm(1)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0)
    .addReg(AMDGPU::VGPR0);

  // ... and terminate wavefront
  BuildMI(MBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));
}
void SILowerControlFlowPass::If(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg)
    .addReg(Vcc);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}
void SILowerControlFlowPass::Else(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst)
    .addReg(Src); // Saved EXEC

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Dst);

  Skip(MI, MI.getOperand(2));

  MI.eraseFromParent();
}
void SILowerControlFlowPass::Break(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Src = MI.getOperand(1).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::IfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vcc = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Vcc)
    .addReg(Src);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Saved = MI.getOperand(1).getReg();
  unsigned Src = MI.getOperand(2).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst)
    .addReg(Saved)
    .addReg(Src);

  MI.eraseFromParent();
}
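
// Lower SI_LOOP. \p Src holds the accumulated mask of lanes that have taken
// a break; clear them from EXEC and branch back to the loop header while any
// lane remains live. A sketch of the emitted pair:
//
//   EXEC = S_ANDN2_B64 EXEC, Src   ; drop lanes that have left the loop
//   S_CBRANCH_EXECNZ <header>      ; keep iterating while EXEC != 0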
void SILowerControlFlowPass::Loop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Src = MI.getOperand(0).getReg();

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Src);

  BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addOperand(MI.getOperand(1))
    .addReg(AMDGPU::EXEC);

  MI.eraseFromParent();
}
void SILowerControlFlowPass::EndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  unsigned Reg = MI.getOperand(0).getReg();

  BuildMI(MBB, MBB.getFirstNonPHI(), DL,
          TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(Reg);

  MI.eraseFromParent();
}

void SILowerControlFlowPass::Branch(MachineInstr &MI) {
  if (MI.getOperand(0).getMBB() == MI.getParent()->getNextNode())
    MI.eraseFromParent();

  // If these aren't equal, this is probably an infinite loop.
}
void SILowerControlFlowPass::Kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  const MachineOperand &Op = MI.getOperand(0);

#ifndef NDEBUG
  const SIMachineFunctionInfo *MFI
    = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
  // Kill is only allowed in pixel / geometry shaders.
  assert(MFI->getShaderType() == ShaderType::PIXEL ||
         MFI->getShaderType() == ShaderType::GEOMETRY);
#endif

  // Clear this thread from the exec mask if the operand is negative
  if (Op.isImm()) {
    // Constant operand: Set exec mask to 0 or do nothing
    if (Op.getImm() & 0x80000000) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
        .addImm(0);
    }
  } else {
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMPX_LE_F32_e32), AMDGPU::VCC)
      .addImm(0)
      .addOperand(Op);
  }

  MI.eraseFromParent();
}
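
// Load the index register used by an indirect addressing pseudo into M0 and
// insert \p MovRel at the pseudo's position. For a uniform (SReg_32) index
// this is a single S_MOV_B32 / S_ADD_I32 into M0. For a divergent (VGPR)
// index, the else-branch below builds a "waterfall" loop: pick the index of
// the first active lane with V_READFIRSTLANE_B32, run MovRel for exactly the
// lanes sharing that value, mask those lanes out of EXEC, and repeat until
// every lane has been handled, finally restoring the saved EXEC mask.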
void SILowerControlFlowPass::LoadM0(MachineInstr &MI, MachineInstr *MovRel,
                                    int Offset) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I = MI;

  unsigned Save = MI.getOperand(1).getReg();
  unsigned Idx = MI.getOperand(3).getReg();

  if (AMDGPU::SReg_32RegClass.contains(Idx)) {
    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(Idx)
        .addImm(Offset);
    } else {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
        .addReg(Idx);
    }
    MBB.insert(I, MovRel);
  } else {
    assert(AMDGPU::SReg_64RegClass.contains(Save));
    assert(AMDGPU::VGPR_32RegClass.contains(Idx));

    // Save the EXEC mask
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), Save)
      .addReg(AMDGPU::EXEC);

    // Read the next variant into VCC (lower 32 bits) <- also loop target
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
            AMDGPU::VCC_LO)
      .addReg(Idx);

    // Move index from VCC into M0
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
      .addReg(AMDGPU::VCC_LO);

    // Compare the just read M0 value to all possible Idx values
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e32), AMDGPU::VCC)
      .addReg(AMDGPU::M0)
      .addReg(Idx);

    // Update EXEC, save the original EXEC value to VCC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), AMDGPU::VCC)
      .addReg(AMDGPU::VCC);

    if (Offset) {
      BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
        .addReg(AMDGPU::M0)
        .addImm(Offset);
    }

    // Do the actual move
    MBB.insert(I, MovRel);

    // Update EXEC, switch all done bits to 0 and all todo bits to 1
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
      .addReg(AMDGPU::EXEC)
      .addReg(AMDGPU::VCC);

    // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
      .addImm(-7)
      .addReg(AMDGPU::EXEC);

    // Restore EXEC
    BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
      .addReg(Save);
  }

  MI.eraseFromParent();
}
/// \param @VecReg The register which holds element zero of the vector being
///                addressed into.
/// \param[out] @Reg The base register to use in the indirect addressing
///                  instruction.
/// \param[in,out] @Offset As an input, this is the constant offset part of
///                        the indirect Index, e.g. v0 = v[VecReg + Offset].
///                        As an output, this is a constant value that needs
///                        to be added to the value stored in M0.
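///
/// A worked example with hypothetical registers, assuming \p VecReg is a
/// four-element tuple whose sub0 has hardware index 4 (v[4:7]): an input
/// Offset of 2 yields Reg = VGPR6 and Offset = 0, while an input Offset of
/// -5 makes RegIdx negative, so Reg falls back to register 0 of the class
/// and Offset = -1 is left to be added to M0 at run time.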
void SILowerControlFlowPass::computeIndirectRegAndOffset(unsigned VecReg,
                                                         unsigned &Reg,
                                                         int &Offset) {
  unsigned SubReg = TRI->getSubReg(VecReg, AMDGPU::sub0);
  if (!SubReg)
    SubReg = VecReg;

  const TargetRegisterClass *RC = TRI->getPhysRegClass(SubReg);
  int RegIdx = TRI->getHWRegIndex(SubReg) + Offset;

  if (RegIdx < 0) {
    Offset = RegIdx;
    RegIdx = 0;
  } else {
    Offset = 0;
  }

  Reg = RC->getRegister(RegIdx);
}
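
// Lower SI_INDIRECT_SRC: conceptually Dst = Vec[Off + M0]. V_MOVRELS_B32
// reads the VGPR whose number is Reg plus the runtime contents of M0, and
// LoadM0() supplies the M0 setup (including the waterfall loop needed for a
// divergent index). IndirectDst() below is the mirror image, writing Val
// into Dst[Off + M0] with V_MOVRELD_B32.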
void SILowerControlFlowPass::IndirectSrc(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  unsigned Vec = MI.getOperand(2).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Reg;

  computeIndirectRegAndOffset(Vec, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
      .addReg(Reg)
      .addReg(AMDGPU::M0, RegState::Implicit)
      .addReg(Vec, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}
void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  unsigned Dst = MI.getOperand(0).getReg();
  int Off = MI.getOperand(4).getImm();
  unsigned Val = MI.getOperand(5).getReg();
  unsigned Reg;

  computeIndirectRegAndOffset(Dst, Reg, Off);

  MachineInstr *MovRel =
    BuildMI(*MBB.getParent(), DL, TII->get(AMDGPU::V_MOVRELD_B32_e32))
      .addReg(Reg, RegState::Define)
      .addReg(Val)
      .addReg(AMDGPU::M0, RegState::Implicit)
      .addReg(Dst, RegState::Implicit);

  LoadM0(MI, MovRel, Off);
}
bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
  TII = static_cast<const SIInstrInfo *>(MF.getSubtarget().getInstrInfo());
  TRI =
      static_cast<const SIRegisterInfo *>(MF.getSubtarget().getRegisterInfo());
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  bool HaveKill = false;
  bool NeedWQM = false;
  bool NeedFlat = false;
  unsigned Depth = 0;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;
      if (TII->isWQM(MI.getOpcode()) || TII->isDS(MI.getOpcode()))
        NeedWQM = true;

      // Flat uses m0 in case it needs to access LDS.
      if (TII->isFLAT(MI.getOpcode()))
        NeedFlat = true;

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF:
          ++Depth;
          If(MI);
          break;

        case AMDGPU::SI_ELSE:
          Else(MI);
          break;

        case AMDGPU::SI_BREAK:
          Break(MI);
          break;

        case AMDGPU::SI_IF_BREAK:
          IfBreak(MI);
          break;

        case AMDGPU::SI_ELSE_BREAK:
          ElseBreak(MI);
          break;

        case AMDGPU::SI_LOOP:
          ++Depth;
          Loop(MI);
          break;

        case AMDGPU::SI_END_CF:
          if (--Depth == 0 && HaveKill) {
            SkipIfDead(MI);
            HaveKill = false;
          }
          EndCf(MI);
          break;

        case AMDGPU::SI_KILL:
          if (Depth == 0)
            SkipIfDead(MI);
          else
            HaveKill = true;
          Kill(MI);
          break;

        case AMDGPU::S_BRANCH:
          Branch(MI);
          break;

        case AMDGPU::SI_INDIRECT_SRC:
          IndirectSrc(MI);
          break;

        case AMDGPU::SI_INDIRECT_DST_V1:
        case AMDGPU::SI_INDIRECT_DST_V2:
        case AMDGPU::SI_INDIRECT_DST_V4:
        case AMDGPU::SI_INDIRECT_DST_V8:
        case AMDGPU::SI_INDIRECT_DST_V16:
          IndirectDst(MI);
          break;
      }
    }
  }
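
  // Pixel shaders that contain derivative (WQM) or DS instructions need
  // whole quad mode: all four lanes of each 2x2 pixel quad must run so that
  // neighboring values exist, so widen the initial EXEC mask with S_WQM_B64
  // at function entry.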
  if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) {
    MachineBasicBlock &MBB = MF.front();
    BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
            AMDGPU::EXEC).addReg(AMDGPU::EXEC);
  }
  // FIXME: This seems inappropriate to do here.
  if (NeedFlat && MFI->IsKernel) {
    // Insert the prologue initializing the SGPRs pointing to the scratch space
    // for flat accesses.
    const MachineFrameInfo *FrameInfo = MF.getFrameInfo();

    // TODO: What to use with function calls?

    // FIXME: This reports the stack size used in the scratch buffer, but not
    // the scratch used for spilled registers as well.
    uint64_t StackSizeBytes = FrameInfo->getStackSize();

    int IndirectBegin
      = static_cast<const AMDGPUInstrInfo*>(TII)->getIndirectIndexBegin(MF);
    // Convert register index to 256-byte unit.
    uint64_t StackOffset = IndirectBegin < 0 ? 0 : (4 * IndirectBegin / 256);

    assert(StackSizeBytes < 0xffff && StackOffset < 0xffff &&
           "Stack limits should fit in 16 bits");

    // Initialize the flat scratch register pair.
    // TODO: Can we use one s_mov_b64 here?

    // Offset is in units of 256 bytes.
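
    // A worked example with hypothetical numbers: if the first indirectly
    // addressed register has index 64, it lives 4 * 64 = 256 bytes into the
    // scratch buffer, so FLAT_SCR_LO gets StackOffset = 1 (one 256-byte
    // unit) while FLAT_SCR_HI gets the per-thread stack size in bytes.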
    MachineBasicBlock &MBB = MF.front();
    DebugLoc NoDL;
    MachineBasicBlock::iterator Start = MBB.getFirstNonPHI();
    const MCInstrDesc &SMovK = TII->get(AMDGPU::S_MOVK_I32);

    assert(isInt<16>(StackOffset) && isInt<16>(StackSizeBytes));

    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_LO)
      .addImm(StackOffset);

    // Documentation says size is "per-thread scratch size in bytes"
    BuildMI(MBB, Start, NoDL, SMovK, AMDGPU::FLAT_SCR_HI)
      .addImm(StackSizeBytes);
  }

  return true;
}