From 0568ba8b8c6d4ded99c3c8cb405cbef1b49016db Mon Sep 17 00:00:00 2001
From: Vadim Girlin
Date: Tue, 8 Oct 2013 03:13:20 +0400
Subject: [PATCH] r600g/sb: work around hw issues with stack on eg/cm

v2: make it actually work, improve condition

Signed-off-by: Vadim Girlin
---
Review notes (ignored by git-am, not part of the commit message):
 * Line breaks were restored; the placement of blank context lines was
   derived from the old/new line counts in each @@ hunk header.
 * Indentation is reconstructed (tabs assumed) because the original
   whitespace was lost - apply with --ignore-whitespace if it does not match.
 * The template arguments of static_cast<> had been stripped. cf_node*,
   region_node* and container_node* follow from the declarations visible in
   this patch; alu_group_node* and fetch_node* on the two untouched context
   lines are assumed - TODO confirm against the tree.
 * The author e-mail addresses in From:/Signed-off-by: are missing and were
   not reconstructed.

 src/gallium/drivers/r600/sb/sb_bc.h            |  21 ++++
 src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 129 +++++++++++++++++--------
 src/gallium/drivers/r600/sb/sb_context.cpp     |   9 +-
 src/gallium/drivers/r600/sb/sb_ir.h            |   5 +-
 src/gallium/drivers/r600/sb/sb_pass.h          |   3 +
 5 files changed, 123 insertions(+), 44 deletions(-)

diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h
index ad1b862..73b8b08 100644
--- a/src/gallium/drivers/r600/sb/sb_bc.h
+++ b/src/gallium/drivers/r600/sb/sb_bc.h
@@ -614,6 +614,10 @@ public:
 	unsigned num_slots;
 	bool uses_mova_gpr;
 
+	bool stack_workaround_8xx;
+	bool stack_workaround_9xx;
+
+	unsigned wavefront_size;
 	unsigned stack_entry_size;
 
 	static unsigned dump_pass;
@@ -638,6 +642,23 @@ public:
 	bool is_cayman() {return hw_class == HW_CLASS_CAYMAN;}
 	bool is_egcm() {return hw_class >= HW_CLASS_EVERGREEN;}
 
+	bool needs_8xx_stack_workaround() {
+		if (!is_evergreen())
+			return false;
+
+		switch (hw_chip) {
+		case HW_CHIP_CYPRESS:
+		case HW_CHIP_JUNIPER:
+			return false;
+		default:
+			return true;
+		}
+	}
+
+	bool needs_9xx_stack_workaround() {
+		return is_cayman();
+	}
+
 	sb_hw_class_bits hw_class_bit() {
 		switch (hw_class) {
 		case HW_CLASS_R600:return HB_R6;
diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
index c56c866..bc71cf8 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
@@ -40,8 +40,9 @@ namespace r600_sb {
 
 int bc_finalizer::run() {
 
-	regions_vec &rv = sh.get_regions();
+	run_on(sh.root);
 
+	regions_vec &rv = sh.get_regions();
 	for (regions_vec::reverse_iterator I = rv.rbegin(), E = rv.rend(); I != E;
 			++I) {
 		region_node *r = *I;
@@ -58,8 +59,6 @@ int bc_finalizer::run() {
 		r->expand();
 	}
 
-	run_on(sh.root);
-
 	cf_peephole();
 
 	// workaround for some problems on r6xx/7xx
@@ -213,18 +212,36 @@ void bc_finalizer::run_on(container_node* c) {
 		if (n->is_alu_group()) {
 			finalize_alu_group(static_cast<alu_group_node*>(n));
 		} else {
-			if (n->is_fetch_inst()) {
+			if (n->is_alu_clause()) {
+				cf_node *c = static_cast<cf_node*>(n);
+
+				if (c->bc.op == CF_OP_ALU_PUSH_BEFORE && ctx.is_egcm()) {
+					if (ctx.stack_workaround_8xx) {
+						region_node *r = c->get_parent_region();
+						if (r) {
+							unsigned ifs, loops;
+							unsigned elems = get_stack_depth(r, loops, ifs);
+							unsigned dmod1 = elems % ctx.stack_entry_size;
+							unsigned dmod2 = (elems + 1) % ctx.stack_entry_size;
+
+							if (elems && (!dmod1 || !dmod2))
+								c->flags |= NF_ALU_STACK_WORKAROUND;
+						}
+					} else if (ctx.stack_workaround_9xx) {
+						region_node *r = c->get_parent_region();
+						if (r) {
+							unsigned ifs, loops;
+							get_stack_depth(r, loops, ifs);
+							if (loops >= 2)
+								c->flags |= NF_ALU_STACK_WORKAROUND;
+						}
+					}
+				}
+			} else if (n->is_fetch_inst()) {
 				finalize_fetch(static_cast<fetch_node*>(n));
 			} else if (n->is_cf_inst()) {
 				finalize_cf(static_cast<cf_node*>(n));
-			} else if (n->is_alu_clause()) {
-
-			} else if (n->is_fetch_clause()) {
-
-			} else {
-				assert(!"unexpected node");
 			}
-
 			if (n->is_container())
 				run_on(static_cast<container_node*>(n));
 		}
@@ -578,10 +595,6 @@ void bc_finalizer::finalize_cf(cf_node* c) {
 
 	unsigned flags = c->bc.op_ptr->flags;
 
-	if (flags & CF_CALL) {
-		update_nstack(c->get_parent_region(), ctx.is_cayman() ? 1 : 2);
-	}
-
 	c->bc.end_of_program = 0;
 	last_cf = c;
 
@@ -715,17 +728,8 @@ void bc_finalizer::finalize_cf(cf_node* c) {
 
 			c->bc.index_gpr = reg >= 0 ? reg : 0;
 		}
-
-
-
-	} else {
-
-#if 0
-		if ((flags & (CF_BRANCH | CF_LOOP)) && !sh.uses_gradients) {
-			c->bc.valid_pixel_mode = 1;
-		}
-#endif
-
+	} else if (flags & CF_CALL) {
+		update_nstack(c->get_parent_region(), ctx.wavefront_size == 16 ? 2 : 1);
 	}
 }
 
@@ -763,37 +767,78 @@ void bc_finalizer::update_ngpr(unsigned gpr) {
 		ngpr = gpr + 1;
 }
 
-void bc_finalizer::update_nstack(region_node* r, unsigned add) {
-	unsigned loops = 0;
-	unsigned ifs = 0;
+unsigned bc_finalizer::get_stack_depth(node *n, unsigned &loops,
+                                       unsigned &ifs, unsigned add) {
+	unsigned stack_elements = add;
+	bool has_non_wqm_push_with_loops_on_stack = false;
+	bool has_non_wqm_push = (add != 0);
+	region_node *r = n->is_region() ?
+			static_cast<region_node*>(n) : n->get_parent_region();
+
+	loops = 0;
+	ifs = 0;
 
 	while (r) {
-		if (r->is_loop())
+		if (r->is_loop()) {
 			++loops;
-		else
+			if (has_non_wqm_push)
+				has_non_wqm_push_with_loops_on_stack = true;
+		} else {
 			++ifs;
-
+			has_non_wqm_push = true;
+		}
 		r = r->get_parent_region();
 	}
-
-	unsigned stack_elements = (loops * ctx.stack_entry_size) + ifs + add;
-
-	// FIXME calculate more precisely
-	if (ctx.is_evergreen()) {
-		++stack_elements;
-	} else {
-		stack_elements += 2;
-		if (ctx.is_cayman())
+	stack_elements += (loops * ctx.stack_entry_size) + ifs;
+
+	// reserve additional elements in some cases
+	switch (ctx.hw_class) {
+	case HW_CLASS_R600:
+	case HW_CLASS_R700:
+		if (has_non_wqm_push)
+			stack_elements += 2;
+		break;
+	case HW_CLASS_CAYMAN:
+		if (stack_elements)
+			stack_elements += 2;
+		break;
+	case HW_CLASS_EVERGREEN:
+		if (has_non_wqm_push_with_loops_on_stack)
 			++stack_elements;
+		break;
 	}
+	return stack_elements;
+}
 
-	unsigned stack_entries = (stack_elements + 3) >> 2;
+void bc_finalizer::update_nstack(region_node* r, unsigned add) {
+	unsigned loops = 0;
+	unsigned ifs = 0;
+	unsigned elems = r ? get_stack_depth(r, loops, ifs, add) : add;
+
+	// XXX all chips expect this value to be computed using 4 as entry size,
+	// not the real entry size
+	unsigned stack_entries = (elems + 3) >> 2;
 
 	if (nstack < stack_entries)
 		nstack = stack_entries;
 }
 
 void bc_finalizer::cf_peephole() {
+	if (ctx.stack_workaround_8xx || ctx.stack_workaround_9xx) {
+		for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E;
+				I = N) {
+			N = I; ++N;
+			cf_node *c = static_cast<cf_node*>(*I);
+
+			if (c->bc.op == CF_OP_ALU_PUSH_BEFORE &&
+					(c->flags & NF_ALU_STACK_WORKAROUND)) {
+				cf_node *push = sh.create_cf(CF_OP_PUSH);
+				c->insert_before(push);
+				push->jump(c);
+				c->bc.set_op(CF_OP_ALU);
+			}
+		}
+	}
 
 	for (node_iterator N, I = sh.root->begin(), E = sh.root->end(); I != E;
 			I = N) {
diff --git a/src/gallium/drivers/r600/sb/sb_context.cpp b/src/gallium/drivers/r600/sb/sb_context.cpp
index 9474f74..8e11428 100644
--- a/src/gallium/drivers/r600/sb/sb_context.cpp
+++ b/src/gallium/drivers/r600/sb/sb_context.cpp
@@ -66,20 +66,27 @@ int sb_context::init(r600_isa *isa, sb_hw_chip chip, sb_hw_class cclass) {
 	case HW_CHIP_RS780:
 	case HW_CHIP_RV620:
 	case HW_CHIP_RS880:
-
+		wavefront_size = 16;
+		stack_entry_size = 8;
+		break;
 	case HW_CHIP_RV630:
 	case HW_CHIP_RV635:
 	case HW_CHIP_RV730:
 	case HW_CHIP_RV710:
 	case HW_CHIP_PALM:
 	case HW_CHIP_CEDAR:
+		wavefront_size = 32;
 		stack_entry_size = 8;
 		break;
 	default:
+		wavefront_size = 64;
 		stack_entry_size = 4;
 		break;
 	}
 
+	stack_workaround_8xx = needs_8xx_stack_workaround();
+	stack_workaround_9xx = needs_9xx_stack_workaround();
+
 	return 0;
 }
 
diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h
index a74d6cb..85c3d06 100644
--- a/src/gallium/drivers/r600/sb/sb_ir.h
+++ b/src/gallium/drivers/r600/sb/sb_ir.h
@@ -700,7 +700,10 @@ enum node_flags {
 	NF_DONT_MOVE = (1 << 8),
 
 	// for KILLxx - we want to schedule them as early as possible
-	NF_SCHEDULE_EARLY = (1 << 9)
+	NF_SCHEDULE_EARLY = (1 << 9),
+
+	// for ALU_PUSH_BEFORE - when set, replace with PUSH + ALU
+	NF_ALU_STACK_WORKAROUND = (1 << 10)
 };
 
 inline node_flags operator |(node_flags l, node_flags r) {
diff --git a/src/gallium/drivers/r600/sb/sb_pass.h b/src/gallium/drivers/r600/sb/sb_pass.h
index 95d2a20..93d122f 100644
--- a/src/gallium/drivers/r600/sb/sb_pass.h
+++ b/src/gallium/drivers/r600/sb/sb_pass.h
@@ -705,6 +705,9 @@ public:
 	void update_ngpr(unsigned gpr);
 	void update_nstack(region_node *r, unsigned add = 0);
 
+	unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs,
+	                         unsigned add = 0);
+
 	void cf_peephole();
 
 };
-- 
1.8.1.2