diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 5559946..c673780 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -364,21 +364,21 @@ struct si_context {
 	unsigned		spi_tmpring_size;
 
 	struct r600_resource	*compute_scratch_buffer;
 
 	/* Emitted derived tessellation state. */
 	/* Local shader (VS), or HS if LS-HS are merged. */
 	struct si_shader	*last_ls;
 	struct si_shader_selector *last_tcs;
 	int			last_num_tcs_input_cp;
 	int			last_tes_sh_base;
-	bool			last_tess_uses_primid;
+	unsigned		last_tess_primid_workaround_patches;
 	unsigned		last_num_patches;
 
 	/* Debug state. */
 	bool			is_debug;
 	struct radeon_saved_cs	last_gfx;
 	struct r600_resource	*last_trace_buf;
 	struct r600_resource	*trace_buf;
 	unsigned		trace_id;
 	uint64_t		dmesg_timestamp;
 	unsigned		apitrace_call_number;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 8508259..3f86e1e 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -98,20 +98,21 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	struct si_shader *ls_current;
 	struct si_shader_selector *ls;
 	/* The TES pointer will only be used for sctx->last_tcs.
 	 * It would be wrong to think that TCS = TES. */
 	struct si_shader_selector *tcs =
 		sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
 	unsigned tess_uses_primid = sctx->ia_multi_vgt_param_key.u.tess_uses_prim_id;
 	bool has_primid_instancing_bug = sctx->b.chip_class == SI &&
 					 sctx->b.screen->info.max_se == 1;
+	unsigned tess_primid_workaround_patches = 0;
 	unsigned tes_sh_base = sctx->shader_userdata.sh_base[PIPE_SHADER_TESS_EVAL];
 	unsigned num_tcs_input_cp = info->vertices_per_patch;
 	unsigned num_tcs_output_cp, num_tcs_inputs, num_tcs_outputs;
 	unsigned num_tcs_patch_outputs;
 	unsigned input_vertex_size, output_vertex_size, pervertex_output_patch_size;
 	unsigned input_patch_size, output_patch_size, output_patch0_offset;
 	unsigned perpatch_output_offset, lds_size;
 	unsigned tcs_in_layout, tcs_out_layout, tcs_out_offsets;
 	unsigned offchip_layout, hardware_lds_size, ls_hs_config;
 
@@ -121,35 +122,41 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 			ls_current = sctx->tcs_shader.current;
 		else
 			ls_current = sctx->fixed_func_tcs_shader.current;
 
 		ls = ls_current->key.part.tcs.ls;
 	} else {
 		ls_current = sctx->vs_shader.current;
 		ls = sctx->vs_shader.cso;
 	}
 
+	if (has_primid_instancing_bug && tess_uses_primid) {
+		if (info->indirect)
+			tess_primid_workaround_patches = 1;
+		else if (info->instance_count > 1)
+			tess_primid_workaround_patches = info->count / num_tcs_input_cp;
+	}
+
 	if (sctx->last_ls == ls_current &&
 	    sctx->last_tcs == tcs &&
 	    sctx->last_tes_sh_base == tes_sh_base &&
 	    sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
-	    (!has_primid_instancing_bug ||
-	     (sctx->last_tess_uses_primid == tess_uses_primid))) {
+	    sctx->last_tess_primid_workaround_patches == tess_primid_workaround_patches) {
 		*num_patches = sctx->last_num_patches;
 		return;
 	}
 
 	sctx->last_ls = ls_current;
 	sctx->last_tcs = tcs;
 	sctx->last_tes_sh_base = tes_sh_base;
 	sctx->last_num_tcs_input_cp = num_tcs_input_cp;
-	sctx->last_tess_uses_primid = tess_uses_primid;
+	sctx->last_tess_primid_workaround_patches = tess_primid_workaround_patches;
 
 	/* This calculates how shader inputs and outputs among VS, TCS, and TES
 	 * are laid out in LDS. */
 	num_tcs_inputs = util_last_bit64(ls->outputs_written);
 
 	if (sctx->tcs_shader.cso) {
 		num_tcs_outputs = util_last_bit64(tcs->outputs_written);
 		num_tcs_output_cp = tcs->info.properties[TGSI_PROPERTY_TCS_VERTICES_OUT];
 		num_tcs_patch_outputs = util_last_bit64(tcs->patch_outputs_written);
 	} else {
@@ -205,22 +212,24 @@ static void si_emit_derived_tess_state(struct si_context *sctx,
 	/* The VGT HS block increments the patch ID unconditionally
 	 * within a single threadgroup. This results in incorrect
 	 * patch IDs when instanced draws are used.
 	 *
 	 * The intended solution is to restrict threadgroups to
 	 * a single instance by setting SWITCH_ON_EOI, which
 	 * should cause IA to split instances up. However, this
 	 * doesn't work correctly on SI when there is no other
 	 * SE to switch to.
 	 */
-	if (has_primid_instancing_bug)
-		*num_patches = 1;
+	if (tess_primid_workaround_patches) {
+		while (tess_primid_workaround_patches % *num_patches != 0)
+			(*num_patches)--;
+	}
 
 	sctx->last_num_patches = *num_patches;
 
 	output_patch0_offset = input_patch_size * *num_patches;
 	perpatch_output_offset = output_patch0_offset + pervertex_output_patch_size;
 
 	/* Compute userdata SGPRs. */
 	assert(((input_vertex_size / 4) & ~0xff) == 0);
 	assert(((output_vertex_size / 4) & ~0xff) == 0);
 	assert(((input_patch_size / 4) & ~0x1fff) == 0);