From 1a12fbe8b55fbb5cc8675d34fab03b3cae59851e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= <nicolai.haehnle@amd.com>
Date: Tue, 28 Nov 2017 13:05:33 +0100
Subject: [PATCH 1/2] StructurizeCFG: Test for branch divergence correctly

Summary:
This fixes cases like the new test @nonuniform. In that test, %cc itself
is a uniform value; however, when reading it after the end of the loop in
basic block %if, its value is effectively non-uniform.

This problem was encountered in
https://bugs.freedesktop.org/show_bug.cgi?id=103743; however, this change
in itself is not sufficient to fix that bug, as there is another issue
in the AMDGPU backend.

Change-Id: I32bbffece4a32f686fab54964dae1a5dd72949d4

Reviewers: arsenm, rampitec, jlebar

Subscribers: wdng, tpr, llvm-commits

Differential Revision: https://reviews.llvm.org/D40546
---
 include/llvm/Analysis/DivergenceAnalysis.h        |  8 +++-
 lib/Transforms/Scalar/StructurizeCFG.cpp          | 11 ++++-
 test/CodeGen/AMDGPU/control-flow-optnone.ll       |  4 +-
 test/Transforms/StructurizeCFG/uniform-regions.ll | 49 +++++++++++++++++++++++
 4 files changed, 67 insertions(+), 5 deletions(-)
 create mode 100644 test/Transforms/StructurizeCFG/uniform-regions.ll
diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h
index aa2de571ba1..36027ab84cc 100644
--- a/include/llvm/Analysis/DivergenceAnalysis.h
+++ b/include/llvm/Analysis/DivergenceAnalysis.h
@@ -28,21 +28,27 @@ public:
     initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry());
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override;
 
   bool runOnFunction(Function &F) override;
 
   // Print all divergent branches in the function.
   void print(raw_ostream &OS, const Module *) const override;
 
-  // Returns true if V is divergent.
+  // Returns true if V is divergent at its definition.
+  //
+  // Even if this function returns false, V may still be divergent when used
+  // in a different basic block.
   bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
 
   // Returns true if V is uniform/non-divergent.
+  //
+  // Even if this function returns true, V may still be divergent when used
+  // in a different basic block.
   bool isUniform(const Value *V) const { return !isDivergent(V); }
 
 private:
   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;
 };
 } // End llvm namespace
diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp
index 2972e1cff9a..d6034deea34 100644
--- a/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -48,20 +48,26 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
 #define DEBUG_TYPE "structurizecfg"
 
 // The name for newly created blocks.
 static const char *const FlowBlockName = "Flow";
 
 namespace {
 
+static cl::opt<bool> ForceSkipUniformRegions(
+  "structurizecfg-skip-uniform-regions",
+  cl::Hidden,
+  cl::desc("Force the StructurizeCFG pass to skip uniform regions"),
+  cl::init(false));
+
 // Definition of the complex types used in this pass.
 
 using BBValuePair = std::pair<BasicBlock *, Value *>;
 
 using RNVector = SmallVector<RegionNode *, 8>;
 using BBVector = SmallVector<BasicBlock *, 8>;
 using BranchVector = SmallVector<BranchInst *, 8>;
 using BBValueVector = SmallVector<BBValuePair, 2>;
 
 using BBSet = SmallPtrSet<BasicBlock *, 8>;
@@ -236,21 +242,22 @@ class StructurizeCFG : public RegionPass {
   void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
 
   void createFlow();
 
   void rebuildSSA();
 
 public:
   static char ID;
 
   explicit StructurizeCFG(bool SkipUniformRegions = false)
-      : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+      : RegionPass(ID),
+        SkipUniformRegions(SkipUniformRegions || ForceSkipUniformRegions) {
     initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
   }
 
   bool doInitialization(Region *R, RGPassManager &RGM) override;
 
   bool runOnRegion(Region *R, RGPassManager &RGM) override;
 
   StringRef getPassName() const override { return "Structurize control flow"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -884,21 +891,21 @@ void StructurizeCFG::rebuildSSA() {
     }
 }
 
 static bool hasOnlyUniformBranches(const Region *R,
                                    const DivergenceAnalysis &DA) {
   for (const BasicBlock *BB : R->blocks()) {
     const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
     if (!Br || !Br->isConditional())
       continue;
 
-    if (!DA.isUniform(Br->getCondition()))
+    if (!DA.isUniform(Br))
       return false;
     DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
   }
   return true;
 }
 
 /// \brief Run the transformation for each region found
 bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
   if (R->isTopLevelRegion())
     return false;
diff --git a/test/CodeGen/AMDGPU/control-flow-optnone.ll b/test/CodeGen/AMDGPU/control-flow-optnone.ll
index 2122af62735..8dda45fee0b 100644
--- a/test/CodeGen/AMDGPU/control-flow-optnone.ll
+++ b/test/CodeGen/AMDGPU/control-flow-optnone.ll
@@ -8,22 +8,22 @@
 
 ; GCN-LABEL: {{^}}copytoreg_divergent_brcond:
 ; GCN: s_branch
 
 ; GCN-DAG: v_cmp_lt_i32
 ; GCN-DAG: v_cmp_gt_i32
 ; GCN: s_and_b64
 ; GCN: s_mov_b64 exec
 
 ; GCN: s_or_b64 exec, exec
-; GCN: v_cmp_eq_u32
-; GCN: s_cbranch_vccnz
+; GCN: s_cmp_eq_u32
+; GCN: s_cbranch_scc1
 ; GCN-NEXT: s_branch
 define amdgpu_kernel void @copytoreg_divergent_brcond(i32 %arg, i32 %arg1, i32 %arg2) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp3 = zext i32 %tmp to i64
   %tmp5 = add i64 %tmp3, undef
   %tmp6 = trunc i64 %tmp5 to i32
   %tmp7 = mul nsw i32 %tmp6, %arg2
   br label %bb8
 
diff --git a/test/Transforms/StructurizeCFG/uniform-regions.ll b/test/Transforms/StructurizeCFG/uniform-regions.ll
new file mode 100644
index 00000000000..d41afd5b4a0
--- /dev/null
+++ b/test/Transforms/StructurizeCFG/uniform-regions.ll
@@ -0,0 +1,49 @@
+; RUN: opt -mtriple=amdgcn-- -S -o - -structurizecfg -structurizecfg-skip-uniform-regions < %s | FileCheck %s
+
+; CHECK-LABEL: @uniform(
+; CHECK: entry:
+; CHECK:   br i1 %cc, label %if, label %end, !structurizecfg.uniform !0
+; CHECK: if:
+; CHECK:   br label %end, !structurizecfg.uniform !0
+define amdgpu_cs void @uniform(i32 inreg %v) {
+entry:
+  %cc = icmp eq i32 %v, 0
+  br i1 %cc, label %if, label %end
+
+if:
+  br label %end
+
+end:
+  ret void
+}
+
+; CHECK-LABEL: @nonuniform(
+; CHECK-NOT: !structurizecfg
+define amdgpu_cs void @nonuniform(i32 addrspace(2)* %ptr) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [0, %entry], [%i.inc, %end.loop]
+  %cc = icmp ult i32 %i, 4
+  br i1 %cc, label %mid.loop, label %for.end
+
+mid.loop:
+  %gep = getelementptr i32, i32 addrspace(2)* %ptr, i32 %i
+  %v = load i32, i32 addrspace(2)* %gep, align 4
+  %cc2 = icmp eq i32 %v, 0
+  br i1 %cc2, label %end.loop, label %for.end
+
+end.loop:
+  %i.inc = add i32 %i, 1
+  br label %for.body
+
+for.end:
+  br i1 %cc, label %if, label %end
+
+if:
+  br label %end
+
+end:
+  ret void
+}
-- 
2.11.0