93 lines
2.4 KiB
Plaintext
93 lines
2.4 KiB
Plaintext
#version 450
|
|
|
|
/* built in:
|
|
in uvec3 gl_NumWorkGroups;
|
|
in uvec3 gl_WorkGroupID;
|
|
in uvec3 gl_LocalInvocationID;
|
|
in uvec3 gl_GlobalInvocationID;
|
|
in uint gl_LocalInvocationIndex;
|
|
*/
|
|
|
|
// Why did we not have conflicts in the Reduction?
|
|
// Because the reduction used sequential addressing; this scan uses interleaved addressing, which causes bank conflicts.
|
|
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
|
|
// Bank count the padding scheme is tuned for (presumably the number of
// shared-memory banks on the target GPU -- confirm for your hardware).
// Not referenced directly in the code visible here; NUM_BANKS_LOG is what is used.
#define NUM_BANKS 32
|
|
// log2(NUM_BANKS): 2^5 == 32. Used as the shift amount in OFFSET() and when
// sizing the padding of the shared array `temp`. Must be kept in sync with NUM_BANKS.
#define NUM_BANKS_LOG 5
|
|
// Not referenced anywhere in the code visible here.
#define SIMD_GROUP_SIZE 32
|
|
// Number of elements scanned by one work group. main() has every invocation
// handle two elements, so the work-group x size is expected to be BUFFER_SIZE / 2.
#define BUFFER_SIZE 256
|
|
|
|
// Work-group size: x is supplied by specialization constant 0, y and z are 1.
// main() loads two elements per invocation into a BUFFER_SIZE-element scan, so
// the host is expected to set the x size to BUFFER_SIZE / 2; a larger value
// would index `temp` out of bounds, a smaller one would leave part of it unwritten.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
// Push constants supplied by the host for this dispatch.
layout(push_constant) uniform PushStruct
|
|
{
|
|
// Number of valid elements in `array`; main() skips loads and stores of any
// element index that is not below this value.
uint size;
|
|
}
|
|
p;
|
|
|
|
// Binding 0: the data to scan. main() reads it and overwrites it in place with
// the work-group-local inclusive prefix sums.
layout(binding = 0) buffer inoutBufer { uint array[]; };
|
|
// Binding 1: one entry per work group, indexed by gl_WorkGroupID.x. main()
// stores the sum of the work group's BUFFER_SIZE elements here.
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };
|
|
|
|
// Work-group scratch array for the scan. It holds BUFFER_SIZE elements plus one
// extra slot per 2^NUM_BANKS_LOG elements (256 + 8 = 264 with the current
// constants), so that the largest padded index, OFFSET(BUFFER_SIZE - 1), stays
// in bounds when AVOID_BANK_CONFLICTS is defined. The padding is allocated even
// when that macro is not defined.
shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];
|
|
|
|
// OFFSET(A) maps a logical element index A to the index actually used in `temp`.
// With AVOID_BANK_CONFLICTS defined, one slot is skipped after every
// 2^NUM_BANKS_LOG elements (A + A / 32 with the current constants); the intent is
// to reduce shared-memory bank conflicts caused by the strided accesses of the scan.
// Without it, OFFSET(A) is the identity.
// AVOID_BANK_CONFLICTS is not defined in the code visible here -- presumably it is
// injected at compile time; confirm against the build.
|
|
#ifdef AVOID_BANK_CONFLICTS
|
|
#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
|
|
#else
|
|
#define OFFSET(A) (A)
|
|
#endif
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/*
 * Work-group-local inclusive prefix sum over `array`.
 *
 * Every invocation owns two neighbouring elements (2 * gid and 2 * gid + 1).
 * The work group copies its BUFFER_SIZE elements into the shared array `temp`,
 * runs an up-sweep followed by a down-sweep over it (which yields an exclusive
 * scan), and writes "exclusive result + own input" back to `array`.
 * The sum of all elements of the work group is stored in g_v[gl_WorkGroupID.x].
 *
 * Elements at or beyond p.size are treated as 0 and never read from or written
 * to `array`.
 */
void main()
{
    const uint lane  = gl_LocalInvocationID.x;
    const uint count = BUFFER_SIZE;

    // Global element indices and shared-memory slots of this invocation's pair.
    const uint globalEven = 2u * gl_GlobalInvocationID.x;
    const uint globalOdd  = globalEven + 1u;
    const uint slotEven   = OFFSET(2u * lane);
    const uint slotOdd    = OFFSET(2u * lane + 1u);
    const uint slotLast   = OFFSET(count - 1u);

    const bool evenValid = globalEven < p.size;
    const bool oddValid  = globalOdd < p.size;

    // Out-of-range elements contribute 0 to the sums.
    uint first  = 0u;
    uint second = 0u;
    if (evenValid)
    {
        first = array[globalEven];
    }
    if (oddValid)
    {
        second = array[globalOdd];
    }

    temp[slotEven] = first;
    temp[slotOdd]  = second;

    // Up-sweep: build partial sums in place, doubling the distance each round.
    // The barrier at the top of each round makes the previous round's writes
    // (or the initial stores above) visible to the whole work group.
    for (uint hop = 1u; hop < count; hop *= 2u)
    {
        barrier();
        const uint right = 2u * hop * (lane + 1u) - 1u;
        if (right < count)
        {
            temp[OFFSET(right)] += temp[OFFSET(right - hop)];
        }
    }

    // The last slot now holds the work-group total. Invocation 0 produced that
    // value itself in the final up-sweep round, so it can read it without
    // another barrier. Publish the total, then zero the slot to seed the down-sweep.
    if (lane == 0u)
    {
        g_v[gl_WorkGroupID.x] = temp[slotLast];
        temp[slotLast] = 0u;
    }

    // Down-sweep: halve the distance each round; the left child receives the
    // parent's value and the parent accumulates the left child's old value.
    for (uint hop = count / 2u; hop != 0u; hop /= 2u)
    {
        barrier();
        const uint right = 2u * hop * (lane + 1u) - 1u;
        if (right < count)
        {
            const uint left    = right - hop;
            const uint leftOld = temp[OFFSET(left)];
            temp[OFFSET(left)]   = temp[OFFSET(right)];
            temp[OFFSET(right)] += leftOld;
        }
    }

    // In the final round (hop == 1) each invocation wrote exactly its own two
    // slots, so they can be read back here without a further barrier.
    // Adding the original input turns the exclusive result into an inclusive one.
    if (evenValid)
    {
        array[globalEven] = temp[slotEven] + first;
    }
    if (oddValid)
    {
        array[globalOdd] = temp[slotOdd] + second;
    }
}
|