#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint  gl_LocalInvocationIndex;
*/

// Work-group parallel sum reduction.
//
// Every work group sums a contiguous tile of 2 * gl_WorkGroupSize.x values
// from v[] and writes one partial sum to g_v[gl_WorkGroupID.x].
//
// Assumptions (not enforced here -- confirm against the host code):
//   * gl_WorkGroupSize.x is a power of two; otherwise the halving loop below
//     skips elements.
//   * gl_WorkGroupSize.x <= bufferSize; otherwise localBuffer is indexed out
//     of bounds.

// The x work-group size is supplied through specialization constant 0.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

layout(push_constant) uniform PushStruct {
    // Used below as an exclusive upper bound for indices into v[].
    // NOTE(review): presumably the number of valid input elements despite
    // the name -- confirm against the host code.
    uint offset;
} p;

layout(binding = 0) buffer inBuffer  { uint v[];   };
layout(binding = 1) buffer outBuffer { uint g_v[]; };

// Shared scratch space: one slot per invocation. Each invocation already adds
// two inputs while loading, so 256 slots cover a 512-element tile.
const uint bufferSize = 256;
shared uint localBuffer[bufferSize];

void main() {
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_WorkGroupID.x;

    // A work group consumes two elements per invocation, so consecutive
    // groups must be spaced 2 * gl_WorkGroupSize.x apart; a smaller stride
    // makes neighbouring groups read overlapping ranges and double count.
    uint tileBase = gid * 2u * gl_WorkGroupSize.x;
    uint idx1 = tileBase + tid;
    uint idx2 = idx1 + gl_WorkGroupSize.x;

    // Out-of-range elements contribute 0, the identity of the sum.
    uint val1 = (idx1 < p.offset) ? v[idx1] : 0u;
    uint val2 = (idx2 < p.offset) ? v[idx2] : 0u;

    // First reduction step happens during the load.
    localBuffer[tid] = val1 + val2;

    // Tree reduction in shared memory.
    for (uint s = gl_WorkGroupSize.x / 2u; s > 0u; s /= 2u) {
        // barrier() must be reached by every invocation of the work group,
        // so it lives outside the divergent `tid < s` branch. It also makes
        // the stores of the previous step (or the initial load) visible.
        barrier();
        if (tid < s) {
            localBuffer[tid] += localBuffer[tid + s];
        }
    }

    // Invocation 0 performed the final addition itself, so no further
    // barrier is needed before it publishes the group's partial sum.
    if (tid == 0u) {
        g_v[gid] = localBuffer[0];
    }
}