From 1c2a864bc3106de5c7c0d13cecd254a305c15d18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 28 Nov 2017 13:05:34 +0100 Subject: [PATCH 2/2] AMDGPU: Fix copying i1 value out of loop with non-uniform exit Summary: When an i1-value is defined inside of a loop and used outside of it, we cannot simply use the SGPR bitmask from the loop's last iteration. There are also useful and correct cases of an i1-value being copied between basic blocks, e.g. when a condition is computed outside of a loop and used inside it. The concept of dominators is not sufficient to capture what is going on, so I propose the notion of "lane-dominators". Fixes a bug encountered in Nier: Automata. Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103743 Change-Id: If37b969ddc71d823ab3004aeafb9ea050e45bd9a Reviewers: arsenm, rampitec Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D40547 --- lib/Target/AMDGPU/SILowerI1Copies.cpp | 4 +- lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp | 75 +++++++++++++++++++++++++ lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h | 24 ++++++++ lib/Target/AMDGPU/Utils/CMakeLists.txt | 1 + test/CodeGen/AMDGPU/i1-copy-from-loop.ll | 48 ++++++++++++++++ 5 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp create mode 100644 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h create mode 100644 test/CodeGen/AMDGPU/i1-copy-from-loop.ll diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp index 3880d052bf8..777c1c36dd0 100644 --- a/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -10,20 +10,21 @@ /// This is not possible for any other value type. Since there are no /// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1. /// //===----------------------------------------------------------------------===// // #define DEBUG_TYPE "si-i1-copies" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "SIInstrInfo.h" +#include "Utils/AMDGPULaneDominator.h" #include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Function.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/Debug.h" #include "llvm/Target/TargetMachine.h" using namespace llvm; @@ -134,21 +135,22 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) { SrcRC == &AMDGPU::VReg_1RegClass) { if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 && DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() && DefInst->getOperand(1).getImm() == 0 && DefInst->getOperand(2).getImm() != 0 && DefInst->getOperand(3).isReg() && TargetRegisterInfo::isVirtualRegister( DefInst->getOperand(3).getReg()) && TRI->getCommonSubClass( MRI.getRegClass(DefInst->getOperand(3).getReg()), - &AMDGPU::SGPR_64RegClass)) { + &AMDGPU::SGPR_64RegClass) && + AMDGPU::laneDominates(DefInst->getParent(), &MBB)) { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64)) .add(Dst) .addReg(AMDGPU::EXEC) .add(DefInst->getOperand(3)); } else { BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64)) .add(Dst) .add(Src) .addImm(0); } diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp new file mode 100644 index 00000000000..1924f71f11c --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp @@ -0,0 +1,75 @@ +//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// MBB A lane-dominates MBB B if +// 1. A dominates B in the usual sense, i.e. every path from the entry to B +// goes through A, and +// 2. whenever B executes, every active lane during that execution of B was +// also active during the most recent execution of A. +// +// The simplest example where A dominates B but does not lane-dominate it is +// where A is a loop: +// +// | +// +--+ +// A | +// +--+ +// | +// B +// +// Unfortunately, the second condition is not fully captured by the control +// flow graph when it is unstructured (as may happen when branch conditions are +// uniform). +// +// The following replacement of the second condition is a conservative +// approximation. It is an equivalent condition when the CFG is fully +// structured: +// +// 2'. every cycle in the CFG that contains A also contains B. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPULaneDominator.h" + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" + +namespace llvm { + +namespace AMDGPU { + +// Given machine basic blocks A and B where A dominates B, check whether +// A lane-dominates B. +// +// The check is conservative, i.e. there can be false-negatives. +bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) { + // Check whether A is reachable from itself without going through B. + DenseSet Reachable; + SmallVector Stack; + + Stack.push_back(A); + do { + MachineBasicBlock *MBB = Stack.back(); + Stack.pop_back(); + + for (MachineBasicBlock *Succ : MBB->successors()) { + if (Succ == A) + return false; + if (Succ != B && Reachable.insert(Succ).second) + Stack.push_back(Succ); + } + } while (!Stack.empty()); + + return true; +} + +} // namespace AMDGPU + +} // namespace llvm diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h new file mode 100644 index 00000000000..4f33a89a364 --- /dev/null +++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h @@ -0,0 +1,24 @@ +//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H +#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H + +namespace llvm { + +class MachineBasicBlock; + +namespace AMDGPU { + +bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB); + +} // end namespace AMDGPU +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt index 01b80ebe8d3..c5ed32e4682 100644 --- a/lib/Target/AMDGPU/Utils/CMakeLists.txt +++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt @@ -1,5 +1,6 @@ add_llvm_library(LLVMAMDGPUUtils AMDGPUBaseInfo.cpp AMDKernelCodeTUtils.cpp AMDGPUAsmUtils.cpp + AMDGPULaneDominator.cpp ) diff --git a/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll new file mode 100644 index 00000000000..0a4a99b5bdc --- /dev/null +++ b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -0,0 +1,48 @@ +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s + +; SI-LABEL: {{^}}i1_copy_from_loop: +; +; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would +; only contain the lanes that were active during the last loop iteration. +; +; SI: ; %for.body +; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4, +; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]] +; SI-NOT: [[VREG]] +; SI: ; %for.end +; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]] +define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) { +entry: + br label %for.body + +for.body: + %i = phi i32 [0, %entry], [%i.inc, %end.loop] + %cc = icmp ult i32 %i, 4 + br i1 %cc, label %mid.loop, label %for.end + +mid.loop: + %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i1 false, i1 false) + %cc2 = fcmp oge float %v, 0.0 + br i1 %cc2, label %end.loop, label %for.end + +end.loop: + %i.inc = add i32 %i, 1 + br label %for.body + +for.end: + br i1 %cc, label %if, label %end + +if: + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float undef, float undef, float undef, i1 true, i1 true) + br label %end + +end: + ret void +} + +declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0 +declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 + +attributes #0 = { nounwind readonly } +attributes #1 = { nounwind inaccessiblememonly } -- 2.11.0