From 1c2a864bc3106de5c7c0d13cecd254a305c15d18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 28 Nov 2017 13:05:34 +0100
Subject: [PATCH 2/2] AMDGPU: Fix copying i1 value out of loop with non-uniform
 exit

Summary:
When an i1-value is defined inside of a loop and used outside of it, we
cannot simply use the SGPR bitmask from the loop's last iteration.

There are also useful and correct cases of an i1-value being copied between
basic blocks, e.g. when a condition is computed outside of a loop and used
inside it. The concept of dominators is not sufficient to capture what is
going on, so I propose the notion of "lane-dominators".

Fixes a bug encountered in Nier: Automata.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=103743
Change-Id: If37b969ddc71d823ab3004aeafb9ea050e45bd9a

Reviewers: arsenm, rampitec

Subscribers: kzhuravl, wdng, mgorny, yaxunl, dstuttard, tpr, llvm-commits, t-tye

Differential Revision: https://reviews.llvm.org/D40547
---
 lib/Target/AMDGPU/SILowerI1Copies.cpp           |  4 +-
 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp | 75 +++++++++++++++++++++++++
 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h   | 24 ++++++++
 lib/Target/AMDGPU/Utils/CMakeLists.txt          |  1 +
 test/CodeGen/AMDGPU/i1-copy-from-loop.ll        | 48 ++++++++++++++++
 5 files changed, 151 insertions(+), 1 deletion(-)
 create mode 100644 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
 create mode 100644 lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
 create mode 100644 test/CodeGen/AMDGPU/i1-copy-from-loop.ll

diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 3880d052bf8..777c1c36dd0 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -10,20 +10,21 @@
 /// This is not possible for any other value type.  Since there are no
 /// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
 ///
 //===----------------------------------------------------------------------===//
 //
 
 #define DEBUG_TYPE "si-i1-copies"
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "Utils/AMDGPULaneDominator.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetMachine.h"
 
 using namespace llvm;
@@ -134,21 +135,22 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
                  SrcRC == &AMDGPU::VReg_1RegClass) {
         if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
             DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
             DefInst->getOperand(1).getImm() == 0 &&
             DefInst->getOperand(2).getImm() != 0 &&
             DefInst->getOperand(3).isReg() &&
             TargetRegisterInfo::isVirtualRegister(
               DefInst->getOperand(3).getReg()) &&
             TRI->getCommonSubClass(
               MRI.getRegClass(DefInst->getOperand(3).getReg()),
-              &AMDGPU::SGPR_64RegClass)) {
+              &AMDGPU::SGPR_64RegClass) &&
+            AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {
           BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))
               .add(Dst)
               .addReg(AMDGPU::EXEC)
               .add(DefInst->getOperand(3));
         } else {
           BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))
               .add(Dst)
               .add(Src)
               .addImm(0);
         }
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
new file mode 100644
index 00000000000..1924f71f11c
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp
@@ -0,0 +1,75 @@
+//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MBB A lane-dominates MBB B if
+// 1. A dominates B in the usual sense, i.e. every path from the entry to B
+//    goes through A, and
+// 2. whenever B executes, every active lane during that execution of B was
+//    also active during the most recent execution of A.
+//
+// The simplest example where A dominates B but does not lane-dominate it is
+// where A is a loop:
+//
+//     |
+//     +--+
+//     A  |
+//     +--+
+//     |
+//     B
+//
+// Unfortunately, the second condition is not fully captured by the control
+// flow graph when it is unstructured (as may happen when branch conditions are
+// uniform).
+//
+// The following replacement of the second condition is a conservative
+// approximation. It is an equivalent condition when the CFG is fully
+// structured:
+//
+// 2'. every cycle in the CFG that contains A also contains B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPULaneDominator.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+
+namespace AMDGPU {
+
+// Given machine basic blocks A and B where A dominates B, check whether
+// A lane-dominates B.
+//
+// The check is conservative, i.e. there can be false-negatives.
+bool laneDominates(MachineBasicBlock *A, MachineBasicBlock *B) {
+  // Check whether A is reachable from itself without going through B.
+  DenseSet<MachineBasicBlock *> Reachable;
+  SmallVector<MachineBasicBlock *, 8> Stack;
+
+  Stack.push_back(A);
+  do {
+    MachineBasicBlock *MBB = Stack.back();
+    Stack.pop_back();
+
+    for (MachineBasicBlock *Succ : MBB->successors()) {
+      if (Succ == A)
+        return false;
+      if (Succ != B && Reachable.insert(Succ).second)
+        Stack.push_back(Succ);
+    }
+  } while (!Stack.empty());
+
+  return true;
+}
+
+} // namespace AMDGPU
+
+} // namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
new file mode 100644
index 00000000000..4f33a89a364
--- /dev/null
+++ b/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h
@@ -0,0 +1,24 @@
+//===- AMDGPULaneDominator.h ------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
+
+namespace llvm {
+
+class MachineBasicBlock;
+
+namespace AMDGPU {
+
+bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);
+
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
diff --git a/lib/Target/AMDGPU/Utils/CMakeLists.txt b/lib/Target/AMDGPU/Utils/CMakeLists.txt
index 01b80ebe8d3..c5ed32e4682 100644
--- a/lib/Target/AMDGPU/Utils/CMakeLists.txt
+++ b/lib/Target/AMDGPU/Utils/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_llvm_library(LLVMAMDGPUUtils
   AMDGPUBaseInfo.cpp
   AMDKernelCodeTUtils.cpp
   AMDGPUAsmUtils.cpp
+  AMDGPULaneDominator.cpp
   )
diff --git a/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
new file mode 100644
index 00000000000..0a4a99b5bdc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
@@ -0,0 +1,48 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}i1_copy_from_loop:
+;
+; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
+; only contain the lanes that were active during the last loop iteration.
+;
+; SI: ; %for.body
+; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,
+; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]
+; SI-NOT: [[VREG]]
+; SI: ; %for.end
+; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]
+define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [0, %entry], [%i.inc, %end.loop]
+  %cc = icmp ult i32 %i, 4
+  br i1 %cc, label %mid.loop, label %for.end
+
+mid.loop:
+  %v = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %tid, i32 %i, i1 false, i1 false)
+  %cc2 = fcmp oge float %v, 0.0
+  br i1 %cc2, label %end.loop, label %for.end
+
+end.loop:
+  %i.inc = add i32 %i, 1
+  br label %for.body
+
+for.end:
+  br i1 %cc, label %if, label %end
+
+if:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float undef, float undef, float undef, i1 true, i1 true)
+  br label %end
+
+end:
+  ret void
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
+
+attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind inaccessiblememonly }
-- 
2.11.0