#version 450

/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint  gl_LocalInvocationIndex;
*/
// Work-group size: x comes from the specialization constant with id 0
// (supplied at pipeline creation); y and z are fixed to 1.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Push constants supplied per dispatch.
layout(push_constant) uniform PushStruct
{
    // Used in main() as an exclusive upper bound on input indices: only
    // elements v[i] with i < offset are read, everything else counts as 0.
    // NOTE(review): despite the name this behaves like an element count —
    // confirm against the host code.
    uint offset;
}
p;

// Binding 0: input values; main() only reads from v.
layout(binding = 0) buffer inBuffer
{
    uint v[];
};
// Binding 1: output; main() writes one partial sum per work group to
// g_v[gl_WorkGroupID.x].
layout(binding = 1) buffer outBuffer
{
    uint g_v[];
};

// Work-group shared scratch buffer for the reduction, one slot per invocation.
// main() adds two input elements per invocation before storing into it, so
// 256 slots cover up to 512 input elements per work group.
// localBuffer is indexed by the local invocation id, so gl_WorkGroupSize.x
// must not exceed bufferSize.
const uint bufferSize = 256;
shared uint[bufferSize] localBuffer;

// Reduces one slice of `v` to a single partial sum per work group.
//
// Every invocation loads up to two input elements (bounds-checked against
// p.offset; out-of-range slots contribute 0), adds them and stores the result
// in localBuffer[tid]. The work group then repeatedly folds the upper half of
// the active range onto the lower half until localBuffer[0] holds the group
// total, which invocation 0 writes to g_v[gl_WorkGroupID.x].
//
// Assumptions implied by the code: gl_WorkGroupSize.x is a power of two (the
// halving steps would otherwise drop elements) and does not exceed bufferSize.
void main()
{
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_WorkGroupID.x;

    // Each work group consumes 2 * gl_WorkGroupSize.x input elements (idx1 and
    // idx2 below), so consecutive groups must start that far apart. Using the
    // fixed bufferSize here made neighbouring groups overlap (or leave gaps)
    // whenever the work-group size was not exactly bufferSize / 2.
    uint offset = gid * 2 * gl_WorkGroupSize.x;

    uint idx1 = offset + tid;
    uint idx2 = idx1 + gl_WorkGroupSize.x;

    uint val1 = 0;
    uint val2 = 0;

    // Elements at or beyond p.offset are treated as 0 so that the last,
    // partially filled group still produces a valid sum.
    if (idx1 < p.offset)
        val1 = v[idx1];
    if (idx2 < p.offset)
        val2 = v[idx2];

    // First reduction step happens while loading.
    localBuffer[tid] = val1 + val2;
    barrier();

    // Reduction in shared memory; the last steps are unrolled below.
    for (uint s = gl_WorkGroupSize.x / 2; s > 32; s >>= 1)
    {
        if (tid < s)
            localBuffer[tid] += localBuffer[tid + s];
        barrier();
    }

    // Unrolled tail. barrier() must be reached by every invocation of the
    // work group (uniform control flow), so it is placed outside the
    // tid-dependent branches. The enclosing gl_WorkGroupSize.x tests are
    // constant for the whole group and therefore uniform.
    if (gl_WorkGroupSize.x >= 64)
    {
        if (tid < 32)
            localBuffer[tid] += localBuffer[tid + 32];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 32)
    {
        if (tid < 16)
            localBuffer[tid] += localBuffer[tid + 16];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 16)
    {
        if (tid < 8)
            localBuffer[tid] += localBuffer[tid + 8];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 8)
    {
        if (tid < 4)
            localBuffer[tid] += localBuffer[tid + 4];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 4)
    {
        if (tid < 2)
            localBuffer[tid] += localBuffer[tid + 2];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 2)
    {
        // Final step: only invocation 0 writes localBuffer[0] and only
        // invocation 0 reads it afterwards, so no further barrier is needed.
        if (tid < 1)
            localBuffer[tid] += localBuffer[tid + 1];
    }

    // Invocation 0 publishes the group's partial sum.
    if (tid == 0)
    {
        g_v[gid] = localBuffer[0];
    }
}