From 951ebad474e2548a76aabcd3e1df3792f99d8a1d Mon Sep 17 00:00:00 2001 From: Vincent Lejeune Date: Thu, 6 Jun 2013 19:43:51 +0200 Subject: [PATCH] R600: Use a refined heuristic to choose when switching clause This is using a hint from the AMD APP OpenCL Programming Guide with empirically tweaked parameters. I used Unigine Heaven 3.0 to determine the best parameters on my system (i7 2600/Radeon 6950/Kernel 3.9.4): the benchmark went from 38.8 average fps to 39.6, which is a ~2% gain. (The Lightmark 2008.2 gain is much more marginal: from 537 to 539.) --- lib/Target/R600/R600MachineScheduler.cpp | 50 +++++++++++++++++++++++++++----- lib/Target/R600/R600MachineScheduler.h | 5 +++- 2 files changed, 46 insertions(+), 9 deletions(-) diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index e1badba..e7e4921 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -243,6 +243,8 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { const AMDGPUSubtarget &ST = DAG->TM.getSubtarget(); InstKindLimit[IDFetch] = ST.getTexVTXClauseSize(); + AluInstCount = 0; + FetchInstCount = 0; } void R600SchedStrategy::MoveUnits(std::vector &QSrc, @@ -252,6 +254,11 @@ void R600SchedStrategy::MoveUnits(std::vector &QSrc, QSrc.clear(); } +static +unsigned getWFCountLimitedByGPR(unsigned GPRCount) { + return 248 / GPRCount; +} + SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { SUnit *SU = 0; NextInstKind = IDOther; @@ -264,6 +271,32 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { bool AllowSwitchFromAlu = (CurEmitted >= InstKindLimit[CurInstKind]) && (!Available[IDFetch].empty() || !Available[IDOther].empty()); + if (CurInstKind == IDAlu && !Available[IDFetch].empty()) { + // We use the heuristic provided by AMD Accelerated Parallel Processing + // OpenCL Programming Guide : + // The approx. 
number of WF that allows TEX inst to hide ALU inst is : + // 500 (cycles for TEX) / (AluFetchRatio * 8 (cycles for ALU)) + float ALUFetchRationEstimate = + (AluInstCount + AvailablesAluCount() + Pending[IDAlu].size()) / + (FetchInstCount + Available[IDFetch].size()); + unsigned NeededWF = 62.5f / ALUFetchRationEstimate; + DEBUG( dbgs() << NeededWF << " approx. Wavefronts Required\n" ); + // We assume the local GPR requirements to be "dominated" by the requirement + // of the TEX clause (which consumes 128 bits regs) ; ALU inst before and + // after TEX are indeed likely to consume or generate values from/for the + // TEX clause. + // Available[IDFetch].size() * 2 : GPRs required in the Fetch clause + // We assume that fetch instructions are either TnXYZW = TEX TnXYZW (need + // one GPR) or TmXYZW = TnXYZW (need 2 GPR). + // (TODO : use RegisterPressure) + // If we are going to use too many GPRs, we flush Fetch instructions to lower + // register pressure on 128 bits regs. + unsigned NearRegisterRequirement = 2 * Available[IDFetch].size(); + if (NeededWF > getWFCountLimitedByGPR(NearRegisterRequirement)) + AllowSwitchFromAlu = true; + } + + // We want to scheduled AR defs as soon as possible to make sure they aren't // put in a different ALU clause from their uses. 
if (!SU && !UnscheduledARDefs.empty()) { @@ -337,6 +370,7 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { } if (CurInstKind == IDAlu) { + AluInstCount ++; switch (getAluKind(SU)) { case AluT_XYZW: CurEmitted += 4; @@ -362,7 +396,8 @@ void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) { if (CurInstKind != IDFetch) { MoveUnits(Pending[IDFetch], Available[IDFetch]); - } + } else + FetchInstCount++; } static bool @@ -574,16 +609,15 @@ SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) { return UnslotedSU; } -bool R600SchedStrategy::isAvailablesAluEmpty() const { - return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() && - AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() && - AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() && - AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty() && - AvailableAlus[AluPredX].empty(); +unsigned R600SchedStrategy::AvailablesAluCount() const { + return AvailableAlus[AluAny].size() + AvailableAlus[AluT_XYZW].size() + + AvailableAlus[AluT_X].size() + AvailableAlus[AluT_Y].size() + + AvailableAlus[AluT_Z].size() + AvailableAlus[AluT_W].size() + + AvailableAlus[AluDiscarded].size() + AvailableAlus[AluPredX].size(); } SUnit* R600SchedStrategy::pickAlu() { - while (!isAvailablesAluEmpty()) { + while (AvailablesAluCount() || !Pending[IDAlu].empty()) { if (!OccupedSlotsMask) { // Bottom up scheduling : predX must comes first if (!AvailableAlus[AluPredX].empty()) { diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index ff13f25..5dbe266 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -61,6 +61,9 @@ class R600SchedStrategy : public MachineSchedStrategy { int CurEmitted; InstKind NextInstKind; + unsigned AluInstCount; + unsigned FetchInstCount; + int InstKindLimit[IDLast]; int OccupedSlotsMask; @@ -86,7 +89,7 @@ private: bool regBelongsToClass(unsigned Reg, const 
TargetRegisterClass *RC) const; AluKind getAluKind(SUnit *SU) const; void LoadAlu(); - bool isAvailablesAluEmpty() const; + unsigned AvailablesAluCount() const; SUnit *AttemptFillSlot (unsigned Slot); void PrepareNextSlot(); SUnit *PopInst(std::vector &Q); -- 1.8.2.1