Files
gpgpu-two/shaders/A2Task1KernelDecompositionUnroll.comp
2026-01-03 22:55:08 +01:00

115 lines
2.3 KiB
Plaintext

#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
// Workgroup is one-dimensional; its width is supplied at pipeline creation
// through specialization constant 0.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Push constant block. main() only reads v[i] for i < p.offset, so despite its
// name this value acts as the exclusive upper bound on valid input indices.
layout(push_constant) uniform PushStruct
{
uint offset;
}
p;
// Input values to be summed (read by main()).
layout(binding = 0) buffer inBuffer
{
uint v[];
};
// Output: main() stores one partial sum per workgroup at g_v[gl_WorkGroupID.x].
layout(binding = 1) buffer outBuffer
{
uint g_v[];
};
// Workgroup-shared scratch space for the reduction in main().
// Every invocation adds two input elements while loading, so a single slot per
// invocation is sufficient (e.g. 512 inputs collapse into 256 slots).
// main() indexes localBuffer with gl_LocalInvocationID.x, so bufferSize must be
// at least the workgroup width chosen via specialization constant 0; nothing in
// this file enforces that.
const uint bufferSize = 256;
shared uint[bufferSize] localBuffer;
// Per-workgroup sum reduction.
//
// Each workgroup sums 2 * gl_WorkGroupSize.x consecutive elements of v[] and
// writes the result to g_v[gl_WorkGroupID.x]. Indices at or beyond p.offset
// contribute 0, so a partially filled last group is handled.
//
// NOTE(review): the halving strides assume gl_WorkGroupSize.x is a power of two
// and does not exceed the size of localBuffer -- confirm against the host code.
void main()
{
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_WorkGroupID.x;

    // Every invocation loads two elements (tid and tid + workgroup width), so
    // one group covers 2 * gl_WorkGroupSize.x inputs. The group stride must
    // match that, otherwise neighbouring groups overlap or skip elements.
    uint base = gid * 2 * gl_WorkGroupSize.x;
    uint idx1 = base + tid;
    uint idx2 = idx1 + gl_WorkGroupSize.x;

    // First reduction step happens during the load; out-of-range reads yield 0.
    uint val1 = 0;
    uint val2 = 0;
    if (idx1 < p.offset)
        val1 = v[idx1];
    if (idx2 < p.offset)
        val2 = v[idx2];
    localBuffer[tid] = val1 + val2;
    barrier();

    // Tree reduction in shared memory for strides larger than 32.
    for (uint s = gl_WorkGroupSize.x / 2; s > 32; s >>= 1)
    {
        if (tid < s)
            localBuffer[tid] += localBuffer[tid + s];
        barrier();
    }

    // Unrolled tail for strides 32..1.
    // barrier() must be reached by every invocation of the workgroup, so it is
    // placed outside the divergent `tid < stride` branch. The enclosing
    // gl_WorkGroupSize.x checks are identical for all invocations and therefore
    // keep the barrier in uniform control flow.
    if (gl_WorkGroupSize.x >= 64)
    {
        if (tid < 32)
            localBuffer[tid] += localBuffer[tid + 32];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 32)
    {
        if (tid < 16)
            localBuffer[tid] += localBuffer[tid + 16];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 16)
    {
        if (tid < 8)
            localBuffer[tid] += localBuffer[tid + 8];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 8)
    {
        if (tid < 4)
            localBuffer[tid] += localBuffer[tid + 4];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 4)
    {
        if (tid < 2)
            localBuffer[tid] += localBuffer[tid + 2];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 2)
    {
        // Last step: invocation 0 reads back only its own write below, so no
        // further barrier is needed.
        if (tid < 1)
            localBuffer[tid] += localBuffer[tid + 1];
    }

    // Invocation 0 now holds the group's total.
    if (tid == 0)
    {
        g_v[gid] = localBuffer[0];
    }
}