diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 5559946..c673780 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -364,21 +364,21 @@ struct si_context {
 	unsigned		spi_tmpring_size;
 
 	struct r600_resource	*compute_scratch_buffer;
 
 	/* Emitted derived tessellation state. */
 	/* Local shader (VS), or HS if LS-HS are merged. */
 	struct si_shader	*last_ls;
 	struct si_shader_selector *last_tcs;
 	int			last_num_tcs_input_cp;
 	int			last_tes_sh_base;
-	bool			last_tess_uses_primid;
+	unsigned		last_tess_primid_workaround_patches;
 	unsigned		last_num_patches;
 
 	/* Debug state. */
 	bool			is_debug;
 	struct radeon_saved_cs	last_gfx;
 	struct r600_resource	*last_trace_buf;
 	struct r600_resource	*trace_buf;
 	unsigned		trace_id;
 	uint64_t		dmesg_timestamp;
 	unsigned		apitrace_call_number;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 8508259..3f86e1e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -98,20 +98,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ls_current;
 	struct si_shader_selector *ls;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
 		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
 	unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
 	bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
 					 sctx->b.screen->info.max_se == 1;
+	unsigned tess_primid_workaround_patches = 0;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
 	unsigned num_tcs_input_cp = info->vertices_per_patch;
 	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
 	unsigned num_tcs_patch_outputs;
 	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
 	unsigned input_patch_size, output_patch_size, output_patch0_offset;
 	unsigned perpatch_output_offset, lds_size;
 	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
 	unsigned offchip_layout, hardware_lds_size, ls_hs_config;
 
@@ -121,35 +122,45 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 			ls_current = sctx->tcs_shader.current;
 		else
 			ls_current = sctx->fixed_func_tcs_shader.current;
 
 		ls = ls_current->key.part.tcs.ls;
 	} else {
 		ls_current = sctx->vs_shader.current;
 		ls = sctx->vs_shader.cso;
 	}
 
+	/* Patch count of the draw (info->count / vertices per patch) that
+	 * *num_patches has to divide evenly. 0 means no workaround is needed
+	 * for this draw; indirect draws use 1, which forces *num_patches to 1.
+	 */
+	if (has_primid_instancing_bug && tess_uses_primid) {
+		if (info->indirect)
+			tess_primid_workaround_patches = 1;
+		else if (info->instance_count > 1)
+			tess_primid_workaround_patches = info->count / num_tcs_input_cp;
+	}
+
 	if (sctx->last_ls == ls_current &&
 	    sctx->last_tcs == tcs &&
 	    sctx->last_tes_sh_base == tes_sh_base &&
 	    sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
-	    (!has_primid_instancing_bug ||
-	     (sctx->last_tess_uses_primid == tess_uses_primid))) {
+	    sctx->last_tess_primid_workaround_patches == tess_primid_workaround_patches) {
 		*num_patches = sctx->last_num_patches;
 		return;
 	}
 
 	sctx->last_ls = ls_current;
 	sctx->last_tcs = tcs;
 	sctx->last_tes_sh_base = tes_sh_base;
 	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
-	sctx->last_tess_uses_primid = tess_uses_primid;
+	sctx->last_tess_primid_workaround_patches = tess_primid_workaround_patches;
 
 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
 	num_tcs_inputs = util_last_bit64(ls->outputs_written);
 
 	if (sctx->tcs_shader.cso) {
 		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
 		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
 	} else {
@@ -205,22 +216,24 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	/* The VGT HS block increments the patch ID unconditionally
 	 * within a single threadgroup. This results in incorrect
 	 * patch IDs when instanced draws are used.
 	 *
 	 * The intended solution is to restrict threadgroups to
 	 * a single instance by setting SWITCH_ON_EOI, which
 	 * should cause IA to split instances up. However, this
 	 * doesn't work correctly on SI when there is no other
 	 * SE to switch to.
 	 */
-	if (has_primid_instancing_bug)
-		*num_patches = 1;
+	if (tess_primid_workaround_patches) {
+		while (tess_primid_workaround_patches % *num_patches != 0)
+			(*num_patches)--;
+	}
 
 	sctx->last_num_patches = *num_patches;
 
 	output_patch0_offset = input_patch_size * *num_patches;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	/* Compute userdata SGPRs. */
 	assert(((input_vertex_size / 4) & ~0xff) == 0);
 	assert(((output_vertex_size / 4) & ~0xff) == 0);
 	assert(((input_patch_size / 4) & ~0x1fff) == 0);