#version 450

/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint  gl_LocalInvocationIndex;
*/

// Why did we not have conflicts in the Reduction?
// Because of the sequential addressing (here we use interleaved => we have conflicts).
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
// Number of shared-memory banks assumed for the target GPU.
// (Not referenced directly in the visible source; only its log2 below is used.)
#define NUM_BANKS 32
// log2(NUM_BANKS); used as the shift amount when padding shared-memory indices.
#define NUM_BANKS_LOG 5
// Not referenced anywhere in the visible source.
#define SIMD_GROUP_SIZE 32
// Number of elements each workgroup scans in shared memory.
#define BUFFER_SIZE 256

// Workgroup x-size is supplied through specialization constant 0; y and z are fixed to 1.
// NOTE(review): main() has every invocation handle two elements of a BUFFER_SIZE-element
// tile, so the host presumably sets constant 0 to BUFFER_SIZE / 2 -- confirm.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Push constant: main() compares element indices against `size` before touching array[].
layout(push_constant) uniform PushStruct
{
    uint size;
}
p;

// Binding 0: data that main() reads and then overwrites in place.
layout(binding = 0) buffer inoutBufer { uint array[]; };
// Binding 1: receives one value per workgroup, written at index gl_WorkGroupID.x.
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };

// Workgroup-shared scratch tile: BUFFER_SIZE elements plus one extra slot per
// 2^NUM_BANKS_LOG elements, leaving room for the padded indices produced by OFFSET().
shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];

// Bank conflicts: index mapping applied to every access to temp[].
// With AVOID_BANK_CONFLICTS defined, one padding slot is skipped after every
// 2^NUM_BANKS_LOG logical elements; otherwise the index is used unchanged.
#ifdef AVOID_BANK_CONFLICTS
#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
#else
#define OFFSET(A) (A)
#endif

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Scans one BUFFER_SIZE-element tile of array[] per workgroup, in place.
// Every invocation owns two neighbouring elements. The tile is staged in temp[],
// reduced (up-sweep), its total is stored to g_v[gl_WorkGroupID.x], and the
// down-sweep then yields exclusive prefix sums. Adding each element's own input
// back on write-out produces the inclusive result stored to array[].
void main()
{
    const uint localId  = gl_LocalInvocationID.x;
    const uint globalId = gl_GlobalInvocationID.x;
    const uint tileSize = BUFFER_SIZE;

    // Indices of the two elements handled by this invocation.
    const uint localEven  = 2u * localId;
    const uint localOdd   = localEven + 1u;
    const uint globalEven = 2u * globalId;
    const uint globalOdd  = globalEven + 1u;

    // Elements past the end of the data are treated as zero and never written back.
    const bool evenInRange = globalEven < p.size;
    const bool oddInRange  = globalOdd < p.size;

    const uint inEven = evenInRange ? array[globalEven] : 0u;
    const uint inOdd  = oddInRange ? array[globalOdd] : 0u;

    // Stage the inputs in the shared tile.
    temp[OFFSET(localEven)] = inEven;
    temp[OFFSET(localOdd)]  = inOdd;

    // Up-sweep (reduction): at each level the right child accumulates its left sibling.
    for (uint stepUp = 1u; stepUp < tileSize; stepUp <<= 1)
    {
        // All invocations must see the previous level's results before continuing.
        barrier();
        const uint right = 2u * stepUp * (localId + 1u) - 1u;
        if (right < tileSize)
        {
            const uint left = right - stepUp;
            temp[OFFSET(right)] = temp[OFFSET(right)] + temp[OFFSET(left)];
        }
    }

    // Invocation 0 wrote the root in the final up-sweep level, so it can publish
    // the tile total and zero the root without an extra barrier.
    if (localId == 0u)
    {
        const uint rootSlot   = OFFSET(tileSize - 1u);
        g_v[gl_WorkGroupID.x] = temp[rootSlot];
        temp[rootSlot]        = 0u;
    }

    // Down-sweep: the left child takes the parent's value, the right child takes
    // parent + old left value.
    for (uint stepDown = tileSize >> 1; stepDown != 0u; stepDown >>= 1)
    {
        barrier();
        const uint right = 2u * stepDown * (localId + 1u) - 1u;
        if (right < tileSize)
        {
            const uint left     = right - stepDown;
            const uint leftVal  = temp[OFFSET(left)];
            const uint rightVal = temp[OFFSET(right)];
            temp[OFFSET(left)]  = rightVal;
            temp[OFFSET(right)] = rightVal + leftVal;
        }
    }

    // The last down-sweep level (step 1) wrote exactly this invocation's two slots,
    // so they can be read back directly. Adding the original input gives the
    // inclusive value.
    if (evenInRange)
        array[globalEven] = temp[OFFSET(localEven)] + inEven;
    if (oddInRange)
        array[globalOdd] = temp[OFFSET(localOdd)] + inOdd;
}