From 936fa068b36483833dbb618ed2bb3bcb5a88e532 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 18 Sep 2014 14:46:46 -0400 Subject: [PATCH] R600/SI: Implement VGPR register spilling v3 VGPRs are spilled to LDS. v2: Only calculate thread id once v3: Move insertion of spill instructions to SIRegisterInfo::eliminateFrameIndex() --- lib/Target/R600/AMDGPUAsmPrinter.cpp | 6 +- lib/Target/R600/SIInstrInfo.cpp | 173 +++++++++++++++++++++++++----- lib/Target/R600/SIInstrInfo.h | 7 ++ lib/Target/R600/SIInstructions.td | 21 ++++ lib/Target/R600/SIMachineFunctionInfo.cpp | 34 +++--- lib/Target/R600/SIMachineFunctionInfo.h | 10 ++ lib/Target/R600/SIRegisterInfo.cpp | 115 +++++++++++++++++++- lib/Target/R600/SIRegisterInfo.h | 1 + 8 files changed, 319 insertions(+), 48 deletions(-) diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp index 2755af2..f29b203 100644 --- a/lib/Target/R600/AMDGPUAsmPrinter.cpp +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -377,8 +377,12 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, LDSAlignShift = 9; } + unsigned LDSSpillSize = MFI->LDSWaveSpillSize * + MFI->getMaximumWorkGroupSize(MF); + unsigned LDSBlocks = - RoundUpToAlignment(MFI->LDSSize, 1 << LDSAlignShift) >> LDSAlignShift; + RoundUpToAlignment(MFI->LDSSize + LDSSpillSize, + 1 << LDSAlignShift) >> LDSAlignShift; // Scratch is allocated in 256 dword blocks. 
unsigned ScratchAlignShift = 10; diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp index 2606161..b5fe973 100644 --- a/lib/Target/R600/SIInstrInfo.cpp +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" +#include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/MC/MCInstrDesc.h" using namespace llvm; @@ -419,32 +420,41 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; - if (RI.hasVGPRs(RC)) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Can't spill VGPR!"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) - .addReg(SrcReg); - } else if (RI.isSGPRClass(RC)) { + if (RI.isSGPRClass(RC)) { // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling // SGPRs. 
- unsigned Opcode; switch (RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; - default: llvm_unreachable("Cannot spill register class"); + case 32: Opcode = AMDGPU::SI_SPILL_S32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break; } + } else if(RI.hasVGPRs(RC)) { + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_V32_SAVE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_SAVE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_SAVE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_SAVE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_SAVE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_SAVE; break; + } + } + if (Opcode != -1) { FrameInfo->setObjectAlignment(FrameIndex, 4); BuildMI(MBB, MI, DL, get(Opcode)) .addReg(SrcReg) .addFrameIndex(FrameIndex); } else { - llvm_unreachable("VGPR spilling not supported"); + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to" + " spill register"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0) + .addReg(SrcReg); } } @@ -456,29 +466,136 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineFunction *MF = MBB.getParent(); MachineFrameInfo *FrameInfo = MF->getFrameInfo(); DebugLoc DL = MBB.findDebugLoc(MI); + int Opcode = -1; - if (RI.hasVGPRs(RC)) { - LLVMContext &Ctx = MF->getFunction()->getContext(); - Ctx.emitError("SIInstrInfo::loadRegToStackSlot - Can't retrieve spilled VGPR!"); - BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) - .addImm(0); - } else if 
(RI.isSGPRClass(RC)){ - unsigned Opcode; + if (RI.isSGPRClass(RC)){ + switch(RC->getSize() * 8) { + case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; + } + } else if(RI.hasVGPRs(RC)) { switch(RC->getSize() * 8) { - case 32: Opcode = AMDGPU::SI_SPILL_S32_RESTORE; break; - case 64: Opcode = AMDGPU::SI_SPILL_S64_RESTORE; break; - case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break; - case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break; - case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break; - default: llvm_unreachable("Cannot spill register class"); + case 32: Opcode = AMDGPU::SI_SPILL_V32_RESTORE; break; + case 64: Opcode = AMDGPU::SI_SPILL_V64_RESTORE; break; + case 96: Opcode = AMDGPU::SI_SPILL_V96_RESTORE; break; + case 128: Opcode = AMDGPU::SI_SPILL_V128_RESTORE; break; + case 256: Opcode = AMDGPU::SI_SPILL_V256_RESTORE; break; + case 512: Opcode = AMDGPU::SI_SPILL_V512_RESTORE; break; } + } + if (Opcode != -1) { FrameInfo->setObjectAlignment(FrameIndex, 4); BuildMI(MBB, MI, DL, get(Opcode), DestReg) .addFrameIndex(FrameIndex); } else { - llvm_unreachable("VGPR spilling not supported"); + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to" + " restore register"); + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addReg(AMDGPU::VGPR0); + } +} + +/// \param FrameOffset Offset in bytes of the FrameIndex being spilled +unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, unsigned TmpReg, + unsigned FrameOffset, + unsigned Size) const { + MachineFunction *MF = MBB.getParent(); + SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); + const AMDGPUSubtarget &ST =
MF->getTarget().getSubtarget<AMDGPUSubtarget>(); + const SIRegisterInfo *TRI = + static_cast<const SIRegisterInfo*>(ST.getRegisterInfo()); + DebugLoc DL = MBB.findDebugLoc(MI); + unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF); + unsigned WavefrontSize = ST.getWavefrontSize(); + + unsigned TIDReg = MFI->getTIDReg(); + if (!MFI->hasCalculatedTID()) { + MachineBasicBlock &Entry = MBB.getParent()->front(); + MachineBasicBlock::iterator Insert = Entry.front(); + DebugLoc DL = Insert->getDebugLoc(); + + TIDReg = RI.findUnusedVGPR(MF->getRegInfo()); + if (TIDReg == AMDGPU::NoRegister) + return TIDReg; + + + if (MFI->getShaderType() == ShaderType::COMPUTE && + WorkGroupSize > WavefrontSize) { + + unsigned TIDIGXReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_X); + unsigned TIDIGYReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Y); + unsigned TIDIGZReg = TRI->getPreloadedValue(*MF, SIRegisterInfo::TIDIG_Z); + unsigned InputPtrReg = + TRI->getPreloadedValue(*MF, SIRegisterInfo::INPUT_PTR); + const unsigned TIDIGRegs[3] = { // not static: values differ per function + TIDIGXReg, TIDIGYReg, TIDIGZReg + }; + for (unsigned Reg : TIDIGRegs) { + if (!Entry.isLiveIn(Reg)) + Entry.addLiveIn(Reg); + } + + RS->enterBasicBlock(&Entry); + unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Z); + BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp1) + .addReg(InputPtrReg) + .addImm(SI::KernelInputOffsets::NGROUPS_Y); + + // NGROUPS.X * NGROUPS.Y NOTE(review): STmp1/STmp0 were loaded from NGROUPS_Y/NGROUPS_Z above - confirm which is intended + BuildMI(Entry, Insert, DL, get(AMDGPU::S_MUL_I32), STmp1) + .addReg(STmp1) + .addReg(STmp0); + // (NGROUPS.X * NGROUPS.Y) * TIDIG.X + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MUL_U32_U24_e32), TIDReg) + .addReg(STmp1) + .addReg(TIDIGXReg); + // NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X) + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MAD_U32_U24), TIDReg) +
.addReg(STmp0) + .addReg(TIDIGYReg) + .addReg(TIDReg); + // (NGROUPS.Z * TIDIG.Y + (NGROUPS.X * NGROUPS.Y * TIDIG.X)) + TIDIG.Z + BuildMI(Entry, Insert, DL, get(AMDGPU::V_ADD_I32_e32), TIDReg) + .addReg(TIDReg) + .addReg(TIDIGZReg); + } else { + // Get the wave id + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_LO_U32_B32_e64), + TIDReg) + .addImm(-1) + .addImm(0); + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_MBCNT_HI_U32_B32_e32), + TIDReg) + .addImm(-1) + .addReg(TIDReg); + } + + BuildMI(Entry, Insert, DL, get(AMDGPU::V_LSHLREV_B32_e32), + TIDReg) + .addImm(2) + .addReg(TIDReg); + MFI->setTIDReg(TIDReg); } + + // Add FrameIndex to LDS offset + unsigned LDSOffset = MFI->LDSSize + (FrameOffset * WorkGroupSize); + BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), TmpReg) + .addImm(LDSOffset) + .addReg(TIDReg); + + return TmpReg; } void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI, diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h index 25e2438..7f280b6 100644 --- a/lib/Target/R600/SIInstrInfo.h +++ b/lib/Target/R600/SIInstrInfo.h @@ -75,6 +75,13 @@ public: unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; + unsigned calculateLDSSpillAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + RegScavenger *RS, + unsigned TmpReg, + unsigned Offset, + unsigned Size) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td index 48104eb..35cf9dd 100644 --- a/lib/Target/R600/SIInstructions.td +++ b/lib/Target/R600/SIInstructions.td @@ -1829,6 +1829,27 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>; defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>; defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>; +multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> { + def _SAVE : InstSI < + (outs), + (ins vgpr_class:$src, i32imm:$frame_idx), + "", [] + >; + + def _RESTORE : InstSI < + (outs vgpr_class:$dst), + (ins
i32imm:$frame_idx), + "", [] + >; +} + +defm SI_SPILL_V32 : SI_SPILL_VGPR <VGPR_32>; +defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>; +defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>; +defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>; +defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>; +defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>; + let Defs = [SCC] in { def SI_CONSTDATA_PTR : InstSI < diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp index b978203..d58f31d 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.cpp +++ b/lib/Target/R600/SIMachineFunctionInfo.cpp @@ -10,8 +10,9 @@ #include "SIMachineFunctionInfo.h" +#include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" -#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" @@ -27,29 +28,18 @@ void SIMachineFunctionInfo::anchor() {} SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), + TIDReg(AMDGPU::NoRegister), PSInputAddr(0), - NumUserSGPRs(0) { } - -/// \brief Returns a register that is not used at any point in the function. -/// If all registers are used, then this function will return -// AMDGPU::NoRegister.
-static unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) { - - const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; - - for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); - I != E; ++I) { - if (!MRI.isPhysRegUsed(*I)) - return *I; - } - return AMDGPU::NoRegister; -} + NumUserSGPRs(0), + LDSWaveSpillSize(0) { } SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( MachineFunction *MF, unsigned FrameIndex, unsigned SubIdx) { const MachineFrameInfo *FrameInfo = MF->getFrameInfo(); + const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo*>( + MF->getTarget().getSubtarget<AMDGPUSubtarget>().getRegisterInfo()); MachineRegisterInfo &MRI = MF->getRegInfo(); int64_t Offset = FrameInfo->getObjectOffset(FrameIndex); Offset += SubIdx * 4; @@ -60,7 +50,7 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( struct SpilledReg Spill; if (!LaneVGPRs.count(LaneVGPRIdx)) { - unsigned LaneVGPR = findUnusedVGPR(MRI); + unsigned LaneVGPR = TRI->findUnusedVGPR(MRI); LaneVGPRs[LaneVGPRIdx] = LaneVGPR; MRI.setPhysRegUsed(LaneVGPR); @@ -76,3 +66,11 @@ SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg( Spill.Lane = Lane; return Spill; } + +unsigned SIMachineFunctionInfo::getMaximumWorkGroupSize( + const MachineFunction &MF) const { + const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>(); + // FIXME: We should get this information from kernel attributes if it + // is available. + return getShaderType() == ShaderType::COMPUTE ? 256 : ST.getWavefrontSize(); +} diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h index 2917edf..6bb8f9d 100644 --- a/lib/Target/R600/SIMachineFunctionInfo.h +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_R600_SIMACHINEFUNCTIONINFO_H #include "AMDGPUMachineFunction.h" +#include "SIRegisterInfo.h" #include <map> namespace llvm { @@ -26,6 +27,9 @@ class MachineRegisterInfo; /// tells the hardware which interpolation parameters to load.
class SIMachineFunctionInfo : public AMDGPUMachineFunction { void anchor() override; + + unsigned TIDReg; + public: struct SpilledReg { @@ -44,6 +48,12 @@ public: unsigned PSInputAddr; unsigned NumUserSGPRs; std::map<unsigned, unsigned> LaneVGPRs; + unsigned LDSWaveSpillSize; + bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; }; + unsigned getTIDReg() const { return TIDReg; }; + void setTIDReg(unsigned Reg) { TIDReg = Reg; } + + unsigned getMaximumWorkGroupSize(const MachineFunction &MF) const; }; } // End namespace llvm diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp index 3924e21..47e7024 100644 --- a/lib/Target/R600/SIRegisterInfo.cpp +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -34,6 +34,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { Reserved.set(AMDGPU::EXEC); Reserved.set(AMDGPU::INDIRECT_BASE_ADDR); Reserved.set(AMDGPU::FLAT_SCR); + + // Reserve some VGPRs to use as temp registers in case we have to spill VGPRs + Reserved.set(AMDGPU::VGPR255); + Reserved.set(AMDGPU::VGPR254); + return Reserved; } @@ -51,18 +56,31 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) { switch (Op) { case AMDGPU::SI_SPILL_S512_SAVE: case AMDGPU::SI_SPILL_S512_RESTORE: + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V512_RESTORE: return 16; case AMDGPU::SI_SPILL_S256_SAVE: case AMDGPU::SI_SPILL_S256_RESTORE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V256_RESTORE: return 8; case AMDGPU::SI_SPILL_S128_SAVE: case AMDGPU::SI_SPILL_S128_RESTORE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V128_RESTORE: return 4; + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V96_RESTORE: + return 3; case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S64_RESTORE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V64_RESTORE: return 2; case AMDGPU::SI_SPILL_S32_SAVE: case AMDGPU::SI_SPILL_S32_RESTORE: + case AMDGPU::SI_SPILL_V32_SAVE: + case
AMDGPU::SI_SPILL_V32_RESTORE: return 1; default: llvm_unreachable("Invalid spill opcode"); } @@ -139,6 +157,82 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, break; } + // VGPR register spill + case AMDGPU::SI_SPILL_V512_SAVE: + case AMDGPU::SI_SPILL_V256_SAVE: + case AMDGPU::SI_SPILL_V128_SAVE: + case AMDGPU::SI_SPILL_V96_SAVE: + case AMDGPU::SI_SPILL_V64_SAVE: + case AMDGPU::SI_SPILL_V32_SAVE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned SrcReg = MI->getOperand(0).getReg(); + int64_t Offset = FrameInfo->getObjectOffset(Index); + unsigned Size = NumSubRegs * 4; + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { + unsigned SubReg = NumSubRegs > 1 ? + getPhysRegSubReg(SrcReg, &AMDGPU::VGPR_32RegClass, i) : + SrcReg; + int64_t SubRegOffset = Offset + (i * 4); // slot offset of dword i; must not accumulate + MFI->LDSWaveSpillSize = std::max((unsigned)SubRegOffset + 4, (unsigned)MFI->LDSWaveSpillSize); + + unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg, + SubRegOffset, Size); + + if (AddrReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling VGPRS"); + AddrReg = AMDGPU::VGPR0; + } + + // Store the value in LDS + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_WRITE_B32)) + .addImm(0) // gds + .addReg(AddrReg, RegState::Kill) // addr + .addReg(SubReg) // data0 + .addImm(0); // offset + } + + MI->eraseFromParent(); + break; + } + case AMDGPU::SI_SPILL_V32_RESTORE: + case AMDGPU::SI_SPILL_V64_RESTORE: + case AMDGPU::SI_SPILL_V96_RESTORE: + case AMDGPU::SI_SPILL_V128_RESTORE: + case AMDGPU::SI_SPILL_V256_RESTORE: + case AMDGPU::SI_SPILL_V512_RESTORE: { + unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode()); + unsigned DstReg = MI->getOperand(0).getReg(); + int64_t Offset = FrameInfo->getObjectOffset(Index); + unsigned Size = NumSubRegs * 4; + unsigned TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0); + + // FIXME: We could use
DS_READ_B64 here to optimize for larger registers. + for (unsigned i = 0, e = NumSubRegs; i != e; ++i) { + unsigned SubReg = NumSubRegs > 1 ? + getPhysRegSubReg(DstReg, &AMDGPU::VGPR_32RegClass, i) : + DstReg; + + int64_t SubRegOffset = Offset + (i * 4); // slot offset of dword i; must not accumulate + unsigned AddrReg = TII->calculateLDSSpillAddress(*MBB, MI, RS, TmpReg, + SubRegOffset, Size); + if (AddrReg == AMDGPU::NoRegister) { + LLVMContext &Ctx = MF->getFunction()->getContext(); + Ctx.emitError("Ran out of VGPRs for spilling VGPRs"); + AddrReg = AMDGPU::VGPR0; + } + + BuildMI(*MBB, MI, DL, TII->get(AMDGPU::DS_READ_B32), SubReg) + .addImm(0) // gds + .addReg(AddrReg, RegState::Kill) // addr + .addImm(0); //offset + } + MI->eraseFromParent(); + break; + } + default: { int64_t Offset = FrameInfo->getObjectOffset(Index); FIOp.ChangeToImmediate(Offset); @@ -173,8 +266,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const { &AMDGPU::SReg_32RegClass, &AMDGPU::VReg_64RegClass, &AMDGPU::SReg_64RegClass, + &AMDGPU::VReg_96RegClass, + &AMDGPU::VReg_128RegClass, &AMDGPU::SReg_128RegClass, - &AMDGPU::SReg_256RegClass + &AMDGPU::VReg_256RegClass, + &AMDGPU::SReg_256RegClass, + &AMDGPU::VReg_512RegClass }; for (const TargetRegisterClass *BaseClass : BaseClasses) { @@ -335,3 +432,19 @@ unsigned SIRegisterInfo::getPreloadedValue(const MachineFunction &MF, } llvm_unreachable("unexpected preloaded value type"); } + +/// \brief Returns a register that is not used at any point in the function. +/// If all registers are used, then this function will return +/// AMDGPU::NoRegister.
+unsigned SIRegisterInfo::findUnusedVGPR(const MachineRegisterInfo &MRI) const { + + const TargetRegisterClass *RC = &AMDGPU::VGPR_32RegClass; + + for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); + I != E; ++I) { + if (!MRI.isPhysRegUsed(*I)) + return *I; + } + return AMDGPU::NoRegister; +} + diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h index 29b5d0c..0ac9f36 100644 --- a/lib/Target/R600/SIRegisterInfo.h +++ b/lib/Target/R600/SIRegisterInfo.h @@ -100,6 +100,7 @@ struct SIRegisterInfo : public AMDGPURegisterInfo { unsigned getPreloadedValue(const MachineFunction &MF, enum PreloadedValue Value) const; + unsigned findUnusedVGPR(const MachineRegisterInfo &MRI) const; }; } // End namespace llvm -- 1.8.5.5