From 4dd9913293621b6d4970f64628dd62fc65ddd6bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Wed, 17 Feb 2016 13:55:18 -0500 Subject: [PATCH] AMDGPU: Add SIWholeQuadMode pass Whole quad mode is already enabled for pixel shaders that compute derivatives, but it must be suspended for instructions that cause a shader to have side effects (i.e. stores and atomics). Another issue related to WQM is that some applications rely on derivatives after non-uniform kills, or after kills that are uniform among the pixels of a primitive, but discard some of the helper pixels that are created initially. This pass addresses both issues. It keeps track of the current global live mask (which is initialized at the beginning and modified by SI_KILL instructions) and adds the necessary instructions to modify the EXEC register depending on whether WQM or exact execution is required. This pass is run before register coalescing so that we can use machine SSA for analysis. The changes in this patch expose a problem with the second machine scheduling pass: target independent instructions like COPY implicitly use EXEC when they operate on VGPRs, but this fact is not encoded in the MIR. This can lead to miscompilation because instructions are moved past changes to EXEC. This pass fixes the problem by adding use-implicit operands to target independent instructions. Some general codegen passes are relaxed to work with such implicit use operands. --- lib/CodeGen/ProcessImplicitDefs.cpp | 2 +- lib/CodeGen/TwoAddressInstructionPass.cpp | 30 +- lib/Target/AMDGPU/AMDGPU.h | 4 + lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 + lib/Target/AMDGPU/CMakeLists.txt | 1 + lib/Target/AMDGPU/SILowerControlFlow.cpp | 29 +- lib/Target/AMDGPU/SIRegisterInfo.h | 7 +- lib/Target/AMDGPU/SIWholeQuadMode.cpp | 707 ++++++++++++++++++++++++++++++ test/CodeGen/AMDGPU/si-scheduler.ll | 2 +- test/CodeGen/AMDGPU/wqm.ll | 399 +++++++++++++++++ 10 files changed, 1162 insertions(+), 21 deletions(-) create mode 100644 lib/Target/AMDGPU/SIWholeQuadMode.cpp create mode 100644 test/CodeGen/AMDGPU/wqm.ll diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp index d27ea2f..210f941 100644 --- a/lib/CodeGen/ProcessImplicitDefs.cpp +++ b/lib/CodeGen/ProcessImplicitDefs.cpp @@ -69,7 +69,7 @@ bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) { !MI->isPHI()) return false; for (const MachineOperand &MO : MI->operands()) - if (MO.isReg() && MO.isUse() && MO.readsReg()) + if (MO.isReg() && MO.isUse() && MO.readsReg() && !MO.isImplicit()) return false; return true; } diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index e8009cc..9badb82 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -1723,20 +1723,31 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { /// /// The instruction is turned into a sequence of sub-register copies: /// -/// %dst = REG_SEQUENCE %v1, ssub0, %v2, ssub1 +/// %dst = REG_SEQUENCE %v1, ssub0, %v2, ssub1, [implicit uses] /// /// Becomes: /// -/// %dst:ssub0 = COPY %v1 -/// %dst:ssub1 = COPY %v2 +/// %dst:ssub0 = COPY %v1, [implicit uses] +/// %dst:ssub1 = COPY %v2, [implicit uses] /// void TwoAddressInstructionPass:: eliminateRegSequence(MachineBasicBlock::iterator &MBBI) { MachineInstr *MI = MBBI; unsigned DstReg = MI->getOperand(0).getReg(); + unsigned NumTrailingImplicit = 0; + + for (unsigned i = MI->getNumOperands(); i > 0; --i) { + 
const MachineOperand &MO = MI->getOperand(i - 1);
+    if (!MO.isReg() || !MO.isImplicit())
+      break;
+    NumTrailingImplicit++;
+  }
+
+  unsigned NumOperands = MI->getNumOperands() - NumTrailingImplicit;
+
   if (MI->getOperand(0).getSubReg() ||
       TargetRegisterInfo::isPhysicalRegister(DstReg) ||
-      !(MI->getNumOperands() & 1)) {
+      !(NumOperands & 1)) {
     DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << *MI);
     llvm_unreachable(nullptr);
   }
@@ -1744,12 +1755,12 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
   SmallVector<unsigned, 4> OrigRegs;
   if (LIS) {
     OrigRegs.push_back(MI->getOperand(0).getReg());
-    for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2)
+    for (unsigned i = 1; i < NumOperands; i += 2)
       OrigRegs.push_back(MI->getOperand(i).getReg());
   }
 
   bool DefEmitted = false;
-  for (unsigned i = 1, e = MI->getNumOperands(); i < e; i += 2) {
+  for (unsigned i = 1; i < NumOperands; i += 2) {
     MachineOperand &UseMO = MI->getOperand(i);
     unsigned SrcReg = UseMO.getReg();
     unsigned SubIdx = MI->getOperand(i+1).getImm();
@@ -1761,7 +1772,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
     // might insert a COPY that uses SrcReg after is was killed.
     bool isKill = UseMO.isKill();
     if (isKill)
-      for (unsigned j = i + 2; j < e; j += 2)
+      for (unsigned j = i + 2; j < NumOperands; j += 2)
        if (MI->getOperand(j).getReg() == SrcReg) {
           MI->getOperand(j).setIsKill();
           UseMO.setIsKill(false);
@@ -1775,6 +1786,9 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
           .addReg(DstReg, RegState::Define, SubIdx)
           .addOperand(UseMO);
 
+    for (unsigned j = 0; j < NumTrailingImplicit; ++j)
+      CopyMI->addOperand(MI->getOperand(NumOperands + j));
+
     // The first def needs an <undef> flag because there is no live register
     // before it.
     if (!DefEmitted) {
@@ -1797,7 +1811,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
   if (!DefEmitted) {
     DEBUG(dbgs() << "Turned: " << *MI << " into an IMPLICIT_DEF");
     MI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
-    for (int j = MI->getNumOperands() - 1, ee = 0; j > ee; --j)
+    for (int j = NumOperands - 1, ee = 0; j > ee; --j)
       MI->RemoveOperand(j);
   } else {
     DEBUG(dbgs() << "Eliminated: " << *MI);
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 31d223e..602918e 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -44,6 +44,7 @@ FunctionPass *createSIFoldOperandsPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
+FunctionPass *createSIWholeQuadModePass();
 FunctionPass *createSILowerControlFlowPass();
 FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
@@ -69,6 +70,9 @@ extern char &SILowerI1CopiesID;
 void initializeSILoadStoreOptimizerPass(PassRegistry &);
 extern char &SILoadStoreOptimizerID;
 
+void initializeSIWholeQuadModePass(PassRegistry &);
+extern char &SIWholeQuadModeID;
+
 void initializeSILowerControlFlowPass(PassRegistry &);
 extern char &SILowerControlFlowPassID;
 
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 32e9d8a..11c4116 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -55,6 +55,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitsPass(*PR);
+  initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
 }
 
@@ -338,6 +339,7 @@
void GCNPassConfig::addPreRegAlloc() { insertPass(&MachineSchedulerID, &RegisterCoalescerID); } addPass(createSIShrinkInstructionsPass(), false); + addPass(createSIWholeQuadModePass()); } void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) { diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 3f1b673..4835baf 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -62,6 +62,7 @@ add_llvm_target(AMDGPUCodeGen SIRegisterInfo.cpp SIShrinkInstructions.cpp SITypeRewriter.cpp + SIWholeQuadMode.cpp ) add_subdirectory(AsmParser) diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index edcfb08..c233af1 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -211,11 +211,31 @@ void SILowerControlFlow::Else(MachineInstr &MI) { DebugLoc DL = MI.getDebugLoc(); unsigned Dst = MI.getOperand(0).getReg(); unsigned Src = MI.getOperand(1).getReg(); + bool ExecModified = false; + + for (MachineInstr *Prev = MI.getPrevNode(); Prev && !Prev->isPHI(); + Prev = Prev->getPrevNode()) { + for (const auto &Def : Prev->defs()) { + if (Def.isReg() && Def.isDef() && Def.getReg() == AMDGPU::EXEC) { + ExecModified = true; + break; + } + } + } BuildMI(MBB, MBB.getFirstNonPHI(), DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) .addReg(Src); // Saved EXEC + if (ExecModified) { + // Adjust the saved exec to account for the modifications during the flow + // block that contains the ELSE. This can happen when WQM mode is switched + // off. + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + } + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) .addReg(AMDGPU::EXEC) .addReg(Dst); @@ -479,7 +499,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { SIMachineFunctionInfo *MFI = MF.getInfo(); bool HaveKill = false; - bool NeedWQM = false; bool NeedFlat = false; unsigned Depth = 0; @@ -492,8 +511,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { Next = std::next(I); MachineInstr &MI = *I; - if (TII->isWQM(MI) || TII->isDS(MI)) - NeedWQM = true; // Flat uses m0 in case it needs to access LDS. if (TII->isFLAT(MI)) @@ -566,12 +583,6 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) { } } - if (NeedWQM && MFI->getShaderType() == ShaderType::PIXEL) { - MachineBasicBlock &MBB = MF.front(); - BuildMI(MBB, MBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WQM_B64), - AMDGPU::EXEC).addReg(AMDGPU::EXEC); - } - if (NeedFlat && MFI->IsKernel) { // TODO: What to use with function calls? // We will need to Initialize the flat scratch register pair. diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h index 76eaa2c..6bb3200 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/lib/Target/AMDGPU/SIRegisterInfo.h @@ -70,9 +70,12 @@ public: } bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const { + const TargetRegisterClass *RC; if (TargetRegisterInfo::isVirtualRegister(Reg)) - return isSGPRClass(MRI.getRegClass(Reg)); - return getPhysRegClass(Reg); + RC = MRI.getRegClass(Reg); + else + RC = getPhysRegClass(Reg); + return isSGPRClass(RC); } /// \returns true if this class contains VGPR registers. 
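The new pass below implements the analysis sketched in the commit message: a "needs WQM" flag is seeded at derivative-consuming instructions (image samples) and propagated backwards to every instruction whose result feeds them, while store-like instructions stay pinned to exact execution. The following stand-alone sketch models only that propagation step; the structs, instruction kinds and names are illustrative, not the pass's real data structures (the real implementation works on MachineInstr/MachineBasicBlock and follows in SIWholeQuadMode.cpp below).

// Simplified model of the backward "needs WQM" propagation (illustrative only).
#include <cstdio>
#include <vector>

enum State { DontCare = 0, Exact = 1, WQM = 2 };

struct Inst {
  int Id;
  State Needs;               // seeded from the kind of instruction
  std::vector<int> UsedVals; // ids of the instructions whose results it reads
};

int main() {
  // 0: interpolation (don't care), 1: image sample (WQM, uses 0),
  // 2: buffer store (Exact, uses 1).
  std::vector<Inst> Prog = {{0, DontCare, {}}, {1, WQM, {0}}, {2, Exact, {1}}};

  std::vector<int> Worklist;
  for (const Inst &I : Prog)
    if (I.Needs == WQM)
      Worklist.push_back(I.Id);

  // Pull every transitive input of a WQM instruction into WQM, unless it is
  // already pinned (e.g. an atomic whose result feeds a sample stays Exact).
  while (!Worklist.empty()) {
    int Id = Worklist.back();
    Worklist.pop_back();
    for (int Src : Prog[Id].UsedVals) {
      if (Prog[Src].Needs != DontCare)
        continue;
      Prog[Src].Needs = WQM;
      Worklist.push_back(Src);
    }
  }

  for (const Inst &I : Prog)
    std::printf("inst %d needs %s\n", I.Id,
                I.Needs == WQM ? "WQM" : I.Needs == Exact ? "Exact" : "-");
  return 0;
}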
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
new file mode 100644
index 0000000..6a1df35
--- /dev/null
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -0,0 +1,707 @@
+//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass adds instructions to enable whole quad mode for pixel
+/// shaders.
+///
+/// Whole quad mode is required for derivative computations, but it interferes
+/// with shader side effects (stores and atomics). This pass is run after the
+/// (first) machine scheduler pass but before register coalescing, so that
+/// machine SSA is available for analysis. It ensures that WQM is enabled when
+/// necessary, but disabled around stores and atomics.
+///
+/// When necessary, this pass creates a function prolog
+///
+///   S_MOV_B64 LiveMask, EXEC
+///   S_WQM_B64 EXEC, EXEC
+///
+/// to enter WQM at the top of the function and surrounds blocks of Exact
+/// instructions by
+///
+///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
+///   ...
+///   S_MOV_B64 EXEC, Tmp
+///
+/// In order to avoid excessive switching during sequences of Exact
+/// instructions, the pass first analyzes which instructions must be run in WQM
+/// (aka which instructions produce values that lead to derivative
+/// computations).
+///
+/// This also provides an alternative lowering of SI_KILL instructions that
+/// masks bits away from the LiveMask, thus providing a D3D10-like behavior
+/// in which discarded pixels continue executing if they are required for
+/// derivative computations.
+///
+/// Basic blocks are always exited (and entered) in WQM as long as they or
+/// some successor needs WQM.
+///
+/// There is room for improvement given better control flow analysis:
+///
+///  (1) at the top level (outside of control flow statements), one SGPR can be
+///      saved by recovering WQM from the LiveMask (this is implemented for the
+///      entry block).
+///
+///  (2) when entire regions (e.g. if-else blocks or entire loops) only
+///      consist of exact and don't-care instructions, the switch only has to
+///      be done at the entry and exit points rather than potentially in each
+///      block of the region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-wqm"
+
+namespace {
+
+// Helper class for inserting a new variable in SSA representation. Usage:
+// - Mark all blocks with a def using addDef
+// - Optionally use setInReg on the entry block of the function
+// - Call finalize to insert and wire up PHI instructions
+// - Use getInReg and getLastDef for generating code that uses or sets the
+//   variable.
+class VariableInsertion { +private: + struct Block { + MachineInstr *PHI = nullptr; + unsigned LastDef = 0; + unsigned InReg = 0; + }; + + MachineFunction &MF; + const MachineDominanceFrontier &MDF; + const MachineDominatorTree &MDT; + const TargetRegisterClass *RC; + + mutable DenseMap Blocks; + +public: + VariableInsertion(MachineFunction &MF, + const MachineDominanceFrontier &MDF, + const MachineDominatorTree &MDT, + const TargetRegisterClass *RC) : MF(MF), MDF(MDF), MDT(MDT), RC(RC) {} + + void addDef(MachineBasicBlock &MBB); + void setInReg(MachineBasicBlock &MBB, unsigned Reg); + void finalize(); + + unsigned getLastDef(MachineBasicBlock &MBB) const; + unsigned getInReg(MachineBasicBlock &MBB) const; + unsigned getOutReg(MachineBasicBlock &MBB) const; +}; + +void VariableInsertion::addDef(MachineBasicBlock &MBB) +{ + Block &BI = Blocks[&MBB]; + if (!BI.LastDef) + BI.LastDef = MF.getRegInfo().createVirtualRegister(RC); +} + +void VariableInsertion::setInReg(MachineBasicBlock &MBB, unsigned Reg) +{ + Block &BI = Blocks[&MBB]; + assert(!BI.InReg); + BI.InReg = Reg; +} + +void VariableInsertion::finalize() +{ + SmallVector Worklist; + + // Insert PHIs + for (auto &MBBI : Blocks) + Worklist.push_back(MBBI.first); + + const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + + while (!Worklist.empty()) { + MachineBasicBlock *MBB = Worklist.back(); + Worklist.pop_back(); + + for (MachineBasicBlock *DFBlock : MDF.find(MBB)->second) { + Block &BI = Blocks[DFBlock]; + if (BI.PHI) + continue; + + assert(!BI.InReg); + unsigned Reg = MRI.createVirtualRegister(RC); + BI.PHI = BuildMI(*DFBlock, DFBlock->begin(), DebugLoc(), + TII->get(TargetOpcode::PHI), Reg); + if (!BI.LastDef) + Worklist.push_back(DFBlock); + } + } + + // Wire up PHIs + for (auto &MBBI : Blocks) { + MachineBasicBlock &MBB = *MBBI.first; + Block &BI = MBBI.second; + if (!BI.PHI) + continue; + + for (MachineBasicBlock *Pred : MBB.predecessors()) { + BI.PHI->addOperand(MachineOperand::CreateReg(getOutReg(*Pred), false)); + BI.PHI->addOperand(MachineOperand::CreateMBB(Pred)); + } + } +} + +unsigned VariableInsertion::getLastDef(MachineBasicBlock &MBB) const +{ + auto BI = Blocks.find(&MBB); + if (BI != Blocks.end()) + return BI->second.LastDef; + return 0; +} + +unsigned VariableInsertion::getOutReg(MachineBasicBlock &MBB) const +{ + Block &BI = Blocks[&MBB]; + if (BI.LastDef) + return BI.LastDef; + return getInReg(MBB); +} + +unsigned VariableInsertion::getInReg(MachineBasicBlock &MBB) const +{ + Block &BI = Blocks[&MBB]; + if (BI.PHI) + return BI.PHI->getOperand(0).getReg(); + if (!BI.InReg) { + if (auto IDomNode = MDT[&MBB]->getIDom()) + BI.InReg = getOutReg(*IDomNode->getBlock()); + } + return BI.InReg; +} + +enum { + StateWQM = 0x1, + StateExact = 0x2, +}; + +struct InstrInfo { + char Needs = 0; + char OutNeeds = 0; +}; + +struct BlockInfo { + char Needs = 0; + char InNeeds = 0; + char OutNeeds = 0; + unsigned WQMKills = 0; +}; + +struct WorkItem { + const MachineBasicBlock *MBB = nullptr; + const MachineInstr *MI = nullptr; + + WorkItem() {} + WorkItem(const MachineBasicBlock *MBB) : MBB(MBB) {} + WorkItem(const MachineInstr *MI) : MI(MI) {} +}; + +class SIWholeQuadMode : public MachineFunctionPass { +private: + const SIInstrInfo *TII; + const SIRegisterInfo *TRI; + MachineRegisterInfo *MRI; + + DenseMap Instructions; + DenseMap Blocks; + SmallVector ExecExports; + bool HaveWQMKill; + + char scanInstructions(const MachineFunction &MF, std::vector& Worklist); + void 
foundWQMKill(const MachineFunction &MF, std::vector& Worklist); + void propagateInstruction(const MachineInstr &MI, std::vector& Worklist); + void propagateBlock(const MachineBasicBlock &MBB, std::vector& Worklist); + char analyzeFunction(const MachineFunction &MF); + + void WQMKill(MachineInstr &MI, unsigned OldLiveMask, unsigned NewLiveMask, + bool TopLevel); + void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SaveWQM, unsigned LiveMaskReg); + void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before, + unsigned SavedWQM); + void processBlock(const VariableInsertion &LiveMask, + MachineBasicBlock &MBB, bool isEntry); + +public: + static char ID; + + SIWholeQuadMode() : + MachineFunctionPass(ID) { } + + bool runOnMachineFunction(MachineFunction &MF) override; + + const char *getPassName() const override { + return "SI Whole Quad Mode"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } +}; + +} // End anonymous namespace + +char SIWholeQuadMode::ID = 0; + +INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, + "SI Whole Quad Mode", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, + "SI Whole Quad Mode", false, false) + +char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID; + +FunctionPass *llvm::createSIWholeQuadModePass() { + return new SIWholeQuadMode; +} + +// Scan instructions to determine which ones require an Exact execmask and +// which ones seed WQM requirements. +char SIWholeQuadMode::scanInstructions(const MachineFunction &MF, + std::vector &Worklist) { + char GlobalFlags = 0; + + for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) { + const MachineBasicBlock &MBB = *BI; + + for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) { + const MachineInstr &MI = *II; + unsigned Opcode = MI.getOpcode(); + char Flags; + + if (TII->isWQM(Opcode) || TII->isDS(Opcode)) { + Flags = StateWQM; + } else if (TII->get(Opcode).mayStore() && + (MI.getDesc().TSFlags & SIInstrFlags::VM_CNT)) { + Flags = StateExact; + } else { + if (Opcode == AMDGPU::EXP) { + //TODO only if op[4] != 0 + ExecExports.push_back(&MI); + } + continue; + } + + Instructions[&MI].Needs = Flags; + Worklist.push_back(&MI); + GlobalFlags |= Flags; + } + } + + return GlobalFlags; +} + +void SIWholeQuadMode::foundWQMKill(const MachineFunction &MF, + std::vector& Worklist) +{ + HaveWQMKill = true; + + for (const MachineInstr *Export : ExecExports) { + InstrInfo &ExportII = Instructions[Export]; + ExportII.Needs = StateExact; + Worklist.push_back(Export); + } + + if (ExecExports.empty()) { + // We are compiling a part of a non-monolithic shader. Need to ensure + // that exiting basic blocks leave the exact exec-mask in place. + for (const MachineBasicBlock &MBB : MF) { + if (MBB.succ_empty()) { + BlockInfo &BI = Blocks[&MBB]; + BI.OutNeeds = StateExact; + Worklist.push_back(&MBB); + } + } + } +} + +void SIWholeQuadMode::propagateInstruction(const MachineInstr &MI, + std::vector& Worklist) { + const MachineBasicBlock &MBB = *MI.getParent(); + InstrInfo &II = Instructions[&MI]; + BlockInfo &BI = Blocks[&MBB]; + + // Control flow-type instructions that are followed by WQM computations + // must themselves be in WQM. 
+ if ((MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL) && + (II.OutNeeds & StateWQM) && !(II.Needs & StateWQM)) { + II.Needs = StateWQM; + + if (MI.getOpcode() == AMDGPU::SI_KILL) { + BI.WQMKills++; + + if (!HaveWQMKill) + foundWQMKill(*MI.getParent()->getParent(), Worklist); + } + } + + // Propagate to block level + BI.Needs |= II.Needs; + if ((BI.InNeeds | II.Needs) != BI.InNeeds) { + BI.InNeeds |= II.Needs; + Worklist.push_back(&MBB); + } + + // Propagate backwards within block + if (const MachineInstr *PrevMI = MI.getPrevNode()) { + char InNeeds = II.Needs | II.OutNeeds; + if (!PrevMI->isPHI()) { + InstrInfo &PrevII = Instructions[PrevMI]; + if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) { + PrevII.OutNeeds |= InNeeds; + Worklist.push_back(PrevMI); + } + } + } + + // Propagate WQM flag to instruction inputs + assert(II.Needs != (StateWQM | StateExact)); + if (II.Needs != StateWQM) + return; + + for (const MachineOperand &Use : MI.uses()) { + if (!Use.isReg() || !Use.isUse()) + continue; + + // At this point, physical registers appear as inputs or outputs + // and following them makes no sense (and would in fact be incorrect + // when the same VGPR is used as both an output and an input that leads + // to a NeedsWQM instruction). + // + // Note: VCC appears e.g. in 64-bit addition with carry - theoretically we + // have to trace this, in practice it happens for 64-bit computations like + // pointers where both dwords are followed already anyway. + if (!TargetRegisterInfo::isVirtualRegister(Use.getReg())) + continue; + + for (const MachineOperand &Def : MRI->def_operands(Use.getReg())) { + const MachineInstr *DefMI = Def.getParent(); + InstrInfo &DefII = Instructions[DefMI]; + + // Obviously skip if DefMI is already flagged as NeedWQM. + // + // The instruction might also be flagged as NeedExact. This happens when + // the result of an atomic is used in a WQM computation. In this case, + // the atomic must not run for helper pixels and the WQM result is + // undefined. + if (DefII.Needs != 0) + continue; + + DefII.Needs = StateWQM; + Worklist.push_back(DefMI); + } + } +} + +void SIWholeQuadMode::propagateBlock(const MachineBasicBlock &MBB, + std::vector& Worklist) { + BlockInfo &BI = Blocks[&MBB]; + + // Propagate through instructions + if (!MBB.empty()) { + const MachineInstr *LastMI = &*MBB.rbegin(); + InstrInfo &LastII = Instructions[LastMI]; + if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) { + LastII.OutNeeds |= BI.OutNeeds; + Worklist.push_back(LastMI); + } + } + + // Predecessor blocks must provide for our WQM/Exact needs. + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + BlockInfo &PredBI = Blocks[Pred]; + if ((BI.InNeeds & ~PredBI.OutNeeds) == 0) + continue; + + PredBI.OutNeeds |= BI.InNeeds; + PredBI.InNeeds |= BI.InNeeds; + Worklist.push_back(Pred); + } + + // All successors must be prepared to accept the same set of WQM/Exact + // data. 
+ for (const MachineBasicBlock *Succ : MBB.successors()) { + BlockInfo &SuccBI = Blocks[Succ]; + if ((BI.OutNeeds & ~SuccBI.InNeeds) == 0) + continue; + + SuccBI.InNeeds |= BI.OutNeeds; + Worklist.push_back(Succ); + } +} + +char SIWholeQuadMode::analyzeFunction(const MachineFunction &MF) { + std::vector Worklist; + char GlobalFlags = scanInstructions(MF, Worklist); + + while (!Worklist.empty()) { + WorkItem WI = Worklist.back(); + Worklist.pop_back(); + + if (WI.MI) + propagateInstruction(*WI.MI, Worklist); + else + propagateBlock(*WI.MBB, Worklist); + } + + return GlobalFlags; +} + +void SIWholeQuadMode::WQMKill(MachineInstr &MI, unsigned OldLiveMask, + unsigned NewLiveMask, bool TopLevel) { + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + const MachineOperand &Op = MI.getOperand(0); + + // Clear this thread from the live mask if the operand is negative + unsigned KilledMask = 0; + if ((Op.isImm())) { + if (!(Op.getImm() & 0x80000000)) + return; + + KilledMask = AMDGPU::EXEC; + } else { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NLE_F32_e32)) + .addImm(0) + .addOperand(Op); + KilledMask = AMDGPU::VCC; + } + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), NewLiveMask) + .addReg(OldLiveMask) + .addReg(KilledMask); + + if (TopLevel) { + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) + .addReg(NewLiveMask); + } else { + unsigned Tmp = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_WQM_B64), Tmp) + .addReg(NewLiveMask); + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Tmp); + } + + // Keep a kill instruction with zero operand. The SILowerControlFlow pass + // will take this as a hint to consider emitting a skip-if-dead sequence. + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::SI_KILL)).addImm(0); + MI.eraseFromParent(); +} + +void SIWholeQuadMode::toExact(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SaveWQM, unsigned LiveMaskReg) +{ + if (SaveWQM) { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + SaveWQM) + .addReg(LiveMaskReg); + } else { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_AND_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(LiveMaskReg); + } +} + +void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB, + MachineBasicBlock::iterator Before, + unsigned SavedWQM) +{ + if (SavedWQM) { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::EXEC) + .addReg(SavedWQM); + } else { + BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + } +} + +void SIWholeQuadMode::processBlock(const VariableInsertion &LiveMask, + MachineBasicBlock &MBB, bool isEntry) { + auto BII = Blocks.find(&MBB); + if (BII == Blocks.end()) + return; + + const BlockInfo &BI = BII->second; + + if (!(BI.InNeeds & StateWQM)) + return; + + if (!isEntry && !BI.WQMKills && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact) + return; + + unsigned LiveMaskReg = LiveMask.getInReg(MBB); + unsigned SavedWQMReg = 0; + unsigned NumWQMKills = 0; + char State = isEntry ? 
StateExact : StateWQM; + + auto II = MBB.getFirstNonPHI(), IE = MBB.end(); + while (II != IE) { + MachineInstr &MI = *II; + ++II; + + // Skip instructions that are not affected by EXEC + if (MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD) && + !MI.isBranch() && !MI.isTerminator()) + continue; + + // Generic instructions such as COPY will either disappear by register + // coalescing or be lowered to SALU or VALU instructions. + if (TargetInstrInfo::isGenericOpcode(MI.getOpcode())) { + if (MI.getNumExplicitOperands() >= 1) { + const MachineOperand &Op = MI.getOperand(0); + if (Op.isReg()) { + if (TRI->isSGPRReg(*MRI, Op.getReg())) { + // SGPR instructions are not affected by EXEC + continue; + } else { + // Generic instructions on VGPRs must be marked as implicitly using + // EXEC or subsequent passes might reschedule them incorrectly. + MI.addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true)); + } + } + } + } + + char Needs = 0; + char OutNeeds = 0; + auto InstrInfoIt = Instructions.find(&MI); + if (InstrInfoIt != Instructions.end()) { + Needs = InstrInfoIt->second.Needs; + OutNeeds = InstrInfoIt->second.OutNeeds; + + // Make sure to switch to Exact mode when appropriate before the end of + // a basic block. Similarly before a kill, so that we can lower the kill + // as a simple EXEC update. + if (!Needs && OutNeeds == StateExact && + (MI.isBranch() || MI.isTerminator() || MI.getOpcode() == AMDGPU::SI_KILL)) + Needs = StateExact; + } + + // State switching + if (Needs && State != Needs) { + if (Needs == StateExact) { + assert(!SavedWQMReg); + + if (!isEntry && (OutNeeds & StateWQM)) + SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + + toExact(MBB, &MI, SavedWQMReg, LiveMaskReg); + } else { + assert(isEntry == (SavedWQMReg == 0)); + toWQM(MBB, &MI, SavedWQMReg); + SavedWQMReg = 0; + } + + State = Needs; + } + + // Handle WQM kills; non-WQM kills are lowered to pure EXEC manipulation + // in a later pass + if (MI.getOpcode() == AMDGPU::SI_KILL && Needs == StateWQM) { + unsigned NewLiveMask; + ++NumWQMKills; + if (NumWQMKills == BI.WQMKills) + NewLiveMask = LiveMask.getLastDef(MBB); + else + NewLiveMask = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + + WQMKill(MI, LiveMaskReg, NewLiveMask, isEntry); + + LiveMaskReg = NewLiveMask; + } + } + + if ((BI.OutNeeds & StateWQM) && State != StateWQM) { + assert(isEntry == (SavedWQMReg == 0)); + toWQM(MBB, MBB.end(), SavedWQMReg); + } else if (BI.OutNeeds == StateExact && State != StateExact) { + toExact(MBB, MBB.end(), 0, LiveMaskReg); + } +} + +bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) { + SIMachineFunctionInfo *MFI = MF.getInfo(); + + if (MFI->getShaderType() != ShaderType::PIXEL) + return false; + + Instructions.clear(); + Blocks.clear(); + ExecExports.clear(); + HaveWQMKill = false; + + TII = static_cast(MF.getSubtarget().getInstrInfo()); + TRI = static_cast(MF.getSubtarget().getRegisterInfo()); + MRI = &MF.getRegInfo(); + + char GlobalFlags = analyzeFunction(MF); + if (!(GlobalFlags & StateWQM)) + return false; + + MachineBasicBlock &Entry = MF.front(); + MachineInstr *EntryMI = Entry.getFirstNonPHI(); + + if (GlobalFlags == StateWQM && !HaveWQMKill) { + // For a shader that needs only WQM, we can just set it once. 
+ BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64), + AMDGPU::EXEC).addReg(AMDGPU::EXEC); + return true; + } + + // Handle the general case + VariableInsertion LiveMask(MF, + getAnalysis(), + getAnalysis(), + &AMDGPU::SReg_64RegClass); + + unsigned InitLiveMask = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), InitLiveMask) + .addReg(AMDGPU::EXEC); + LiveMask.setInReg(*MF.begin(), InitLiveMask); + + for (const auto &BII : Blocks) { + if (BII.second.WQMKills) + LiveMask.addDef(const_cast(*BII.first)); + } + + LiveMask.finalize(); + + for (const auto &BII : Blocks) + processBlock(LiveMask, const_cast(*BII.first), + BII.first == &*MF.begin()); + + return true; +} diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll index 19c5815..b059872 100644 --- a/test/CodeGen/AMDGPU/si-scheduler.ll +++ b/test/CodeGen/AMDGPU/si-scheduler.ll @@ -3,9 +3,9 @@ ; The test checks the "si" machine scheduler pass works correctly. ; CHECK-LABEL: {{^}}main: -; CHECK: s_wqm ; CHECK: s_load_dwordx4 ; CHECK: s_load_dwordx8 +; CHECK: s_wqm ; CHECK: s_waitcnt lgkmcnt(0) ; CHECK: image_sample ; CHECK: s_waitcnt vmcnt(0) diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll new file mode 100644 index 0000000..73a0f5b --- /dev/null +++ b/test/CodeGen/AMDGPU/wqm.ll @@ -0,0 +1,399 @@ +;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s +;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s + +; Check that WQM isn't triggered by image load/store intrinsics. +; +;CHECK-LABEL: {{^}}test1: +;CHECK-NOT: s_wqm +define <4 x float> @test1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + call void @llvm.amdgcn.image.store.v4i32(<4 x float> %tex, <4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0) + ret <4 x float> %tex +} + +; Check that WQM is triggered by image samples and left untouched for loads... +; +;CHECK-LABEL: {{^}}test2: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK-NOT: exec +;CHECK: _load_dword v0, +define float @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 { +main_body: + %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %c.2 = bitcast <4 x float> %c.1 to <4 x i32> + %c.3 = extractelement <4 x i32> %c.2, i32 0 + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3 + %data = load float, float addrspace(1)* %gep + ret float %data +} + +; ... but disabled for stores (and, in this simple case, not re-enabled). 
+; +;CHECK-LABEL: {{^}}test3: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK-NOT: exec +define <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex.1 = bitcast <4 x float> %tex to <4 x i32> + %tex.2 = extractelement <4 x i32> %tex.1, i32 0 + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %tex.2 + %wr = extractelement <4 x float> %tex, i32 1 + store float %wr, float addrspace(1)* %gep + ret <4 x float> %tex +} + +; Check that WQM is re-enabled when required. +; +;CHECK-LABEL: {{^}}test4: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: v_mul_lo_i32 [[MUL:v[0-9]+]], v0, v1 +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: image_sample v[0:3], 15, 0, 0, 0, 0, 0, 0, 0, [[MUL]], s[0:7], s[8:11] +define <4 x float> @test4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %d, float %data) #0 { +main_body: + %c.1 = mul i32 %c, %d + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.1 + store float %data, float addrspace(1)* %gep + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; Check a case of one branch of an if-else requiring WQM, the other requiring +; exact. +; +; Note: In this particular case, the save-and-restore could be avoided if the +; analysis understood that the two branches of the if-else are mutually +; exclusive. +; +;CHECK-LABEL: {{^}}test_control_flow_0: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %ELSE +;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]] +;CHECK: store +;CHECK: s_mov_b64 exec, [[SAVED]] +;CHECK: %IF +;CHECK: image_sample +define float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %IF, label %ELSE + +IF: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %data.if = extractelement <4 x float> %tex, i32 0 + br label %END + +ELSE: + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c + store float %data, float addrspace(1)* %gep + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +; Reverse branch order compared to the previous test. 
+; +;CHECK-LABEL: {{^}}test_control_flow_1: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %IF +;CHECK: image_sample +;CHECK: %Flow +;CHECK-NEXT: s_or_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], +;CHECK-NEXT: s_and_b64 exec, exec, [[ORIG]] +;CHECK-NEXT: s_and_b64 [[SAVED]], exec, [[SAVED]] +;CHECK-NEXT: s_xor_b64 exec, exec, [[SAVED]] +;CHECK-NEXT: %ELSE +;CHECK: store +;CHECK: %END +define float @test_control_flow_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) #0 { +main_body: + %cmp = icmp eq i32 %z, 0 + br i1 %cmp, label %ELSE, label %IF + +IF: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %data.if = extractelement <4 x float> %tex, i32 0 + br label %END + +ELSE: + %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c + store float %data, float addrspace(1)* %gep + br label %END + +END: + %r = phi float [ %data.if, %IF ], [ %data, %ELSE ] + ret float %r +} + +; Check that branch conditions are properly marked as needing WQM... +; +;CHECK-LABEL: {{^}}test_control_flow_2: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: load +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: v_cmp +define <4 x float> @test_control_flow_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 { +main_body: + %idx.1 = extractelement <3 x i32> %idx, i32 0 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 0 + store float %data.1, float addrspace(1)* %gep.1 + + ; The load that determines the branch (and should therefore be WQM) is + ; surrounded by stores that require disabled WQM. + %idx.2 = extractelement <3 x i32> %idx, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 + %z = load float, float addrspace(1)* %gep.2 + + %idx.3 = extractelement <3 x i32> %idx, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 + %data.3 = extractelement <2 x float> %data, i32 1 + store float %data.3, float addrspace(1)* %gep.3 + + %cc = fcmp ogt float %z, 0.0 + br i1 %cc, label %IF, label %ELSE + +IF: + %coord.IF = mul i32 %coord, 3 + br label %END + +ELSE: + %coord.ELSE = mul i32 %coord, 4 + br label %END + +END: + %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ] + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; ... but only if they really do need it. 
+; +;CHECK-LABEL: {{^}}test_control_flow_3: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: load +;CHECK: store +;CHECK: v_cmp +define float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <3 x i32> %idx, <2 x float> %data, i32 %coord) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex.1 = extractelement <4 x float> %tex, i32 0 + + %idx.1 = extractelement <3 x i32> %idx, i32 0 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 0 + store float %data.1, float addrspace(1)* %gep.1 + + %idx.2 = extractelement <3 x i32> %idx, i32 1 + %gep.2 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.2 + %z = load float, float addrspace(1)* %gep.2 + + %idx.3 = extractelement <3 x i32> %idx, i32 2 + %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 + %data.3 = extractelement <2 x float> %data, i32 1 + store float %data.3, float addrspace(1)* %gep.3 + + %cc = fcmp ogt float %z, 0.0 + br i1 %cc, label %IF, label %ELSE + +IF: + %tex.IF = fmul float %tex.1, 3.0 + br label %END + +ELSE: + %tex.ELSE = fmul float %tex.1, 4.0 + br label %END + +END: + %tex.END = phi float [ %tex.IF, %IF ], [ %tex.ELSE, %ELSE ] + ret float %tex.END +} + +; Another test that failed at some point because of terminator handling. +; +;CHECK-LABEL: {{^}}test_control_flow_4: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: %IF +;CHECK: load +;CHECK: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[ORIG]] +;CHECK: store +;CHECK: s_mov_b64 exec, [[SAVE]] +;CHECK: %END +;CHECK: image_sample +define <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 { +main_body: + %cond = icmp eq i32 %y, 0 + br i1 %cond, label %IF, label %END + +IF: + %data = load float, float addrspace(1)* %ptr + %gep = getelementptr float, float addrspace(1)* %ptr, i32 1 + store float %data, float addrspace(1)* %gep + br label %END + +END: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; Kill must run in WQM if there is downstream WQM ... 
+; +;CHECK-LABEL: {{^}}test_kill_0: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: v_cmp_nle_f32_e32 vcc, +;CHECK: s_andn2_b64 [[NEW:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc +;CHECK: s_wqm_b64 exec, [[NEW]] +;CHECK: s_and_b64 exec, exec, [[NEW]] +;CHECK: store +;CHECK: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[NEW]] +define <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, float %z) #0 { +main_body: + %idx.1 = extractelement <2 x i32> %idx, i32 0 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 0 + store float %data.1, float addrspace(1)* %gep.1 + + call void @llvm.AMDGPU.kill(float %z) + + %idx.3 = extractelement <2 x i32> %idx, i32 1 + %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 + %data.3 = extractelement <2 x float> %data, i32 1 + store float %data.3, float addrspace(1)* %gep.3 + + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +; ... and only if there is downstream WQM. +; +;CHECK-LABEL: {{^}}test_kill_1: +;CHECK-NEXT: ; %main_body +;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK-NEXT: s_wqm_b64 exec, exec +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[ORIG]] +;CHECK: store +;CHECK: v_cmpx +;CHECK: store +define <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, float %z) #0 { +main_body: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + + %idx.1 = extractelement <2 x i32> %idx, i32 0 + %gep.1 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.1 + %data.1 = extractelement <2 x float> %data, i32 0 + store float %data.1, float addrspace(1)* %gep.1 + + call void @llvm.AMDGPU.kill(float %z) + + %idx.3 = extractelement <2 x i32> %idx, i32 1 + %gep.3 = getelementptr float, float addrspace(1)* %ptr, i32 %idx.3 + %data.3 = extractelement <2 x float> %data, i32 1 + store float %data.3, float addrspace(1)* %gep.3 + + ret <4 x float> %tex +} + +; Check that WQM kill returns to exact exec before exports. +; +;CHECK-LABEL: test_kill_2 +;CHECK: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: v_cmp_nle_f32_e32 vcc, +;CHECK: s_andn2_b64 [[NEW:s\[[0-9]+:[0-9]+\]]], [[ORIG]], vcc +;CHECK: s_wqm_b64 exec, [[NEW]] +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[NEW]] +;CHECK: exp +define void @test_kill_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, float %z) #0 { +main_body: + call void @llvm.AMDGPU.kill(float %z) + + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + %tex.0 = extractelement <4 x float> %tex, i32 0 + + call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tex.0, float undef, float undef, float undef) + ret void +} + +; Check WQM kill inside control flow. 
+; +;CHECK-LABEL: test_kill_3 +;CHECK: s_mov_b64 [[LIVE:s\[[0-9]+:[0-9]+\]]], exec +;CHECK: s_wqm_b64 exec, exec +;CHECK: %IF +;CHECK: v_cmp_nle_f32_e32 vcc, +;CHECK: s_andn2_b64 [[LIVE]], [[LIVE]], vcc +;CHECK: s_wqm_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[NEW]] +;CHECK: s_and_b64 exec, exec, [[TMP]] +;CHECK: %END +;CHECK: image_sample +;CHECK: s_and_b64 exec, exec, [[NEW]] +define <4 x float> @test_kill_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %coord, i32 %y, float %z) #0 { +main_body: + %cond = icmp eq i32 %y, 0 + br i1 %cond, label %IF, label %END + +IF: + call void @llvm.AMDGPU.kill(float %z) + br label %END + +END: + %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tex +} + +declare void @llvm.amdgcn.image.store.v4i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1 + +declare <4 x float> @llvm.amdgcn.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2 + +declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 +declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3 + +declare void @llvm.AMDGPU.kill(float) +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) + +attributes #0 = { "ShaderType"="0" } +attributes #1 = { nounwind } +attributes #2 = { nounwind readonly } +attributes #3 = { nounwind readnone } -- 2.5.0
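As a companion to processBlock() above, here is a minimal stand-alone sketch of the block-local rewriting idea: EXEC is only rewritten when the next instruction requires a different state than the current one, so consecutive exact (or WQM) instructions share a single switch. The instruction names and the printed s_and_b64/s_wqm_b64 lines are illustrative stand-ins, not output generated by the pass, and the block is assumed to be entered in WQM after the prolog.

// Simplified model of the per-block WQM/Exact state switching (illustrative).
#include <cstdio>
#include <string>
#include <vector>

enum State { DontCare = 0, Exact = 1, WQM = 2 };

struct Inst {
  std::string Name;
  State Needs;
};

int main() {
  std::vector<Inst> Block = {
      {"v_interp_p1_f32", DontCare}, {"image_sample", WQM},
      {"v_mul_f32", DontCare},       {"buffer_store_dword", Exact},
      {"buffer_store_dword", Exact}, // no second switch: already Exact
      {"image_sample", WQM}};

  State Cur = WQM; // assume the block is entered in WQM
  for (const Inst &I : Block) {
    if (I.Needs != DontCare && I.Needs != Cur) {
      if (I.Needs == Exact)
        std::printf("  s_and_b64 exec, exec, LiveMask  ; drop helper pixels\n");
      else
        std::printf("  s_wqm_b64 exec, exec            ; re-enter WQM\n");
      Cur = I.Needs;
    }
    std::printf("  %s\n", I.Name.c_str());
  }
  return 0;
}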