#version 450

// Workgroup-level sum reduction.
//
// Every workgroup sums a contiguous slice of 2 * gl_WorkGroupSize.x elements
// of `v` and writes a single partial sum to g_v[gl_WorkGroupID.x].
//
// Requirements (not checked by the shader):
//   - local_size_x (specialization constant 0) must be a power of two;
//     the halving loop below drops elements otherwise.
//   - local_size_x must not exceed bufferSize, or localBuffer is indexed
//     out of bounds.

/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint  gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

layout(push_constant) uniform PushStruct {
    // Used as an exclusive upper bound on the indices read from `v`:
    // any index >= offset contributes 0 to the sum.
    uint offset;
} p;

layout(binding = 0) buffer inBuffer  { uint v[];   };
layout(binding = 1) buffer outBuffer { uint g_v[]; };

// One shared slot per invocation. The first addition happens while loading,
// so e.g. a 256-wide workgroup covers 512 input elements with 256 slots.
const uint bufferSize = 256;
shared uint[bufferSize] localBuffer;

void main() {
    uint tid       = gl_LocalInvocationID.x;
    uint gid       = gl_WorkGroupID.x;
    uint groupSize = gl_WorkGroupSize.x;

    // Each workgroup consumes 2 * groupSize inputs (idx1 and idx2 below), so
    // consecutive workgroups must start 2 * groupSize elements apart;
    // a smaller stride would make neighbouring groups re-read the same inputs.
    uint base = gid * 2u * groupSize;
    uint idx1 = base + tid;
    uint idx2 = idx1 + groupSize;

    // Out-of-range elements are treated as 0 (the identity for addition).
    uint val1 = 0;
    uint val2 = 0;
    if (idx1 < p.offset) val1 = v[idx1];
    if (idx2 < p.offset) val2 = v[idx2];

    // First reduction step is folded into the load.
    localBuffer[tid] = val1 + val2;
    barrier();

    // Tree reduction in shared memory. barrier() must be executed in uniform
    // control flow, so it sits outside the `tid < s` test and every
    // invocation reaches it on every iteration. The trip count depends only
    // on gl_WorkGroupSize.x, which is fixed when the pipeline is created.
    for (uint s = groupSize / 2u; s > 0u; s >>= 1) {
        if (tid < s) {
            localBuffer[tid] += localBuffer[tid + s];
        }
        barrier();
    }

    // Invocation 0 holds the workgroup total.
    if (tid == 0u) {
        g_v[gid] = localBuffer[0];
    }
}