From 1a12fbe8b55fbb5cc8675d34fab03b3cae59851e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nicolai=20H=C3=A4hnle?= Date: Tue, 28 Nov 2017 13:05:33 +0100 Subject: [PATCH 1/2] StructurizeCFG: Test for branch divergence correctly Summary: This fixes cases like the new test @nonuniform. In that test, %cc itself is a uniform value; however, when reading it after the end of the loop in basic block %if, its value is effectively non-uniform. This problem was encountered in https://bugs.freedesktop.org/show_bug.cgi?id=103743; however, this change in itself is not sufficient to fix that bug, as there is another issue in the AMDGPU backend. Change-Id: I32bbffece4a32f686fab54964dae1a5dd72949d4 Reviewers: arsenm, rampitec, jlebar Subscribers: wdng, tpr, llvm-commits Differential Revision: https://reviews.llvm.org/D40546 --- include/llvm/Analysis/DivergenceAnalysis.h | 8 +++- lib/Transforms/Scalar/StructurizeCFG.cpp | 11 ++++- test/CodeGen/AMDGPU/control-flow-optnone.ll | 4 +- test/Transforms/StructurizeCFG/uniform-regions.ll | 49 +++++++++++++++++++++++ 4 files changed, 67 insertions(+), 5 deletions(-) create mode 100644 test/Transforms/StructurizeCFG/uniform-regions.ll diff --git a/include/llvm/Analysis/DivergenceAnalysis.h b/include/llvm/Analysis/DivergenceAnalysis.h index aa2de571ba1..36027ab84cc 100644 --- a/include/llvm/Analysis/DivergenceAnalysis.h +++ b/include/llvm/Analysis/DivergenceAnalysis.h @@ -28,21 +28,27 @@ public: initializeDivergenceAnalysisPass(*PassRegistry::getPassRegistry()); } void getAnalysisUsage(AnalysisUsage &AU) const override; bool runOnFunction(Function &F) override; // Print all divergent branches in the function. void print(raw_ostream &OS, const Module *) const override; - // Returns true if V is divergent. + // Returns true if V is divergent at its definition. + // + // Even if this function returns false, V may still be divergent when used + // in a different basic block. bool isDivergent(const Value *V) const { return DivergentValues.count(V); } // Returns true if V is uniform/non-divergent. + // + // Even if this function returns true, V may still be divergent when used + // in a different basic block. bool isUniform(const Value *V) const { return !isDivergent(V); } private: // Stores all divergent values. DenseSet DivergentValues; }; } // End llvm namespace diff --git a/lib/Transforms/Scalar/StructurizeCFG.cpp b/lib/Transforms/Scalar/StructurizeCFG.cpp index 2972e1cff9a..d6034deea34 100644 --- a/lib/Transforms/Scalar/StructurizeCFG.cpp +++ b/lib/Transforms/Scalar/StructurizeCFG.cpp @@ -48,20 +48,26 @@ using namespace llvm; using namespace llvm::PatternMatch; #define DEBUG_TYPE "structurizecfg" // The name for newly created blocks. static const char *const FlowBlockName = "Flow"; namespace { +static cl::opt ForceSkipUniformRegions( + "structurizecfg-skip-uniform-regions", + cl::Hidden, + cl::desc("Force the StructurizeCFG pass to skip uniform regions"), + cl::init(false)); + // Definition of the complex types used in this pass. using BBValuePair = std::pair; using RNVector = SmallVector; using BBVector = SmallVector; using BranchVector = SmallVector; using BBValueVector = SmallVector; using BBSet = SmallPtrSet; @@ -236,21 +242,22 @@ class StructurizeCFG : public RegionPass { void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd); void createFlow(); void rebuildSSA(); public: static char ID; explicit StructurizeCFG(bool SkipUniformRegions = false) - : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) { + : RegionPass(ID), + SkipUniformRegions(SkipUniformRegions || ForceSkipUniformRegions) { initializeStructurizeCFGPass(*PassRegistry::getPassRegistry()); } bool doInitialization(Region *R, RGPassManager &RGM) override; bool runOnRegion(Region *R, RGPassManager &RGM) override; StringRef getPassName() const override { return "Structurize control flow"; } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -884,21 +891,21 @@ void StructurizeCFG::rebuildSSA() { } } static bool hasOnlyUniformBranches(const Region *R, const DivergenceAnalysis &DA) { for (const BasicBlock *BB : R->blocks()) { const BranchInst *Br = dyn_cast(BB->getTerminator()); if (!Br || !Br->isConditional()) continue; - if (!DA.isUniform(Br->getCondition())) + if (!DA.isUniform(Br)) return false; DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n"); } return true; } /// \brief Run the transformation for each region found bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { if (R->isTopLevelRegion()) return false; diff --git a/test/CodeGen/AMDGPU/control-flow-optnone.ll b/test/CodeGen/AMDGPU/control-flow-optnone.ll index 2122af62735..8dda45fee0b 100644 --- a/test/CodeGen/AMDGPU/control-flow-optnone.ll +++ b/test/CodeGen/AMDGPU/control-flow-optnone.ll @@ -8,22 +8,22 @@ ; GCN-LABEL: {{^}}copytoreg_divergent_brcond: ; GCN: s_branch ; GCN-DAG: v_cmp_lt_i32 ; GCN-DAG: v_cmp_gt_i32 ; GCN: s_and_b64 ; GCN: s_mov_b64 exec ; GCN: s_or_b64 exec, exec -; GCN: v_cmp_eq_u32 -; GCN: s_cbranch_vccnz +; GCN: s_cmp_eq_u32 +; GCN: s_cbranch_scc1 ; GCN-NEXT: s_branch define amdgpu_kernel void @copytoreg_divergent_brcond(i32 %arg, i32 %arg1, i32 %arg2) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() %tmp3 = zext i32 %tmp to i64 %tmp5 = add i64 %tmp3, undef %tmp6 = trunc i64 %tmp5 to i32 %tmp7 = mul nsw i32 %tmp6, %arg2 br label %bb8 diff --git a/test/Transforms/StructurizeCFG/uniform-regions.ll b/test/Transforms/StructurizeCFG/uniform-regions.ll new file mode 100644 index 00000000000..d41afd5b4a0 --- /dev/null +++ b/test/Transforms/StructurizeCFG/uniform-regions.ll @@ -0,0 +1,49 @@ +; RUN: opt -mtriple=amdgcn-- -S -o - -structurizecfg -structurizecfg-skip-uniform-regions < %s | FileCheck %s + +; CHECK-LABEL: @uniform( +; CHECK: entry: +; CHECK: br i1 %cc, label %if, label %end, !structurizecfg.uniform !0 +; CHECK: if: +; CHECK: br label %end, !structurizecfg.uniform !0 +define amdgpu_cs void @uniform(i32 inreg %v) { +entry: + %cc = icmp eq i32 %v, 0 + br i1 %cc, label %if, label %end + +if: + br label %end + +end: + ret void +} + +; CHECK-LABEL: @nonuniform( +; CHECK-NOT: !structurizecfg +define amdgpu_cs void @nonuniform(i32 addrspace(2)* %ptr) { +entry: + br label %for.body + +for.body: + %i = phi i32 [0, %entry], [%i.inc, %end.loop] + %cc = icmp ult i32 %i, 4 + br i1 %cc, label %mid.loop, label %for.end + +mid.loop: + %gep = getelementptr i32, i32 addrspace(2)* %ptr, i32 %i + %v = load i32, i32 addrspace(2)* %gep, align 4 + %cc2 = icmp eq i32 %v, 0 + br i1 %cc2, label %end.loop, label %for.end + +end.loop: + %i.inc = add i32 %i, 1 + br label %for.body + +for.end: + br i1 %cc, label %if, label %end + +if: + br label %end + +end: + ret void +} -- 2.11.0