#version 430

// Per-tile brightness sum.
//
// Every workgroup covers one 8x8 texel tile of cs_sampler0. A weighted RGB
// value is computed for each of the 64 texels, the values are added up, and
// the total is written (as raw float bits) to cs_image0 at the workgroup's
// row-major index. Texels that fall outside img_size contribute 1.0.
//
// Note carried over from the original author: single-threaded mode works
// correctly on their RX 480; when SINGLE_THREADED is disabled, the shader
// produces incorrect output.
#define SINGLE_THREADED

// Tile edge length in texels. A macro is used so that the local_size layout
// qualifiers below still see a plain integer literal after preprocessing.
#define TILE_DIM 8

// Number of texels (and, in multi-threaded mode, invocations) per tile.
const int TILE_TEXELS = TILE_DIM * TILE_DIM;

// RGB weights applied to each texel; the values are close to the
// 0.299 / 0.587 / 0.114 luma coefficients.
const vec3 LUMA_WEIGHTS = vec3(0.298912f, 0.586611f, 0.114478f);

layout(std140, binding = 76) uniform block_cs_cb1
{
    vec4 unused;
    vec2 img_size;        // Image size, in pixels
    vec2 num_workgroups;  // Number of CS workgroups (gl_NumWorkGroups is used instead)
};

layout(binding = 160) uniform sampler2D cs_sampler0;           // Scaled scene image
layout(binding = 0) writeonly uniform uimageBuffer cs_image0;  // Output buffer

// One slot per invocation of the workgroup; only used in multi-threaded mode.
shared float tile_values[TILE_TEXELS];

#ifdef SINGLE_THREADED
layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
#else
layout(local_size_x = TILE_DIM, local_size_y = TILE_DIM, local_size_z = 1) in;
#endif

// Returns the value a single texel adds to its tile's sum: the weighted RGB
// of mip level 0 when the texel lies inside img_size, 1.0 otherwise.
float texel_contribution(ivec2 texel)
{
    bool inside_image = all(lessThan(vec2(texel), img_size));
    if (!inside_image)
        return 1.0f;

    vec3 rgb = texelFetch(cs_sampler0, texel, 0).xyz;
    return dot(rgb, LUMA_WEIGHTS);
}

// Stores a tile total into the output buffer. The slot is the workgroup's
// row-major index; the float is stored bit-for-bit in the x component.
void write_tile_sum(float total)
{
    uint slot = gl_WorkGroupID.y * gl_NumWorkGroups.x + gl_WorkGroupID.x;
    imageStore(cs_image0, int(slot), uvec4(floatBitsToUint(total), 0u, 0u, 0u));
}

void main()
{
#ifdef SINGLE_THREADED
    // A single invocation walks the whole tile, row by row.
    ivec2 tile_origin = TILE_DIM * ivec2(gl_WorkGroupID.xy);

    float total = 0.0f;
    for (int i = 0; i < TILE_TEXELS; ++i)
    {
        ivec2 offset = ivec2(i % TILE_DIM, i / TILE_DIM);
        total += texel_contribution(tile_origin + offset);
    }

    write_tile_sum(total);
#else
    // Each invocation handles exactly one texel and parks its value in
    // shared memory.
    tile_values[int(gl_LocalInvocationIndex)] =
        texel_contribution(ivec2(gl_GlobalInvocationID.xy));

    // Every invocation must have written its slot before the values are read.
    groupMemoryBarrier();
    barrier();

    // Only the first invocation of the workgroup adds up and writes the result.
    if (gl_LocalInvocationIndex != 0u)
        return;

    float total = 0.0f;
    for (int i = 0; i < TILE_TEXELS; ++i)
        total += tile_values[i];

    write_tile_sum(total);
#endif
}