#version 450

/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint  gl_LocalInvocationIndex;
*/

// Work-group local prefix sum over a tile of BUFFER_SIZE elements held in
// shared memory, using an up-sweep (reduction) followed by a down-sweep.
//
// Why did the Reduction not suffer from bank conflicts? Because it used
// sequential addressing. This shader uses interleaved addressing, so
// conflicts do occur unless the padded OFFSET() mapping below is enabled.

// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32

// Number of elements scanned by one work group (two per invocation).
#define BUFFER_SIZE 256

// NOTE(review): the local size is supplied through specialization constant 0,
// while the tile size is fixed at BUFFER_SIZE. The code below appears to assume
// local_size_x == BUFFER_SIZE / 2 -- confirm against the host-side setup.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

layout(push_constant) uniform PushStruct
{
    uint size; // number of valid elements in `array`
} p;

// Scanned in place: read as input, overwritten with the result.
layout(binding = 0) buffer inoutBufer { uint array[]; };
// Receives one value per work group (the root of that group's up-sweep).
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };

// One extra slot per NUM_BANKS elements leaves room for the padded mapping.
shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];

// Bank conflicts: with AVOID_BANK_CONFLICTS defined, every logical index is
// shifted by (index / NUM_BANKS); otherwise the index is used unchanged.
#ifdef AVOID_BANK_CONFLICTS
    #define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
#else
    #define OFFSET(A) (A)
#endif

//////////////////////////////////////////////////////////////////////////////

void main()
{
    const uint lane      = gl_LocalInvocationID.x;
    const uint tileElems = BUFFER_SIZE;

    // Each invocation owns two consecutive elements of the global array ...
    const uint globalLo = 2 * gl_GlobalInvocationID.x;
    const uint globalHi = globalLo + 1;

    // ... and the two matching (possibly padded) slots of the shared tile.
    const uint slotLo = OFFSET(2 * lane);
    const uint slotHi = OFFSET(2 * lane + 1);

    // Elements past the end of the input contribute zero.
    const uint inLo = (globalLo < p.size) ? array[globalLo] : 0u;
    const uint inHi = (globalHi < p.size) ? array[globalHi] : 0u;

    temp[slotLo] = inLo;
    temp[slotHi] = inHi;

    // Up-sweep (reduction) phase: partial sums accumulate towards the last slot.
    uint stride = 1;
    while (stride < tileElems)
    {
        barrier(); // writes of the previous level must be complete before reading

        const uint right = (lane + 1) * (stride << 1) - 1;
        if (right < tileElems)
        {
            temp[OFFSET(right)] += temp[OFFSET(right - stride)];
        }

        stride <<= 1;
    }

    // Publish the root of the reduction for this work group, then clear it so
    // the down-sweep starts from zero. Invocation 0 itself wrote that slot in
    // the final up-sweep step, so no barrier is required in between.
    if (lane == 0)
    {
        const uint rootSlot = OFFSET(tileElems - 1);
        g_v[gl_WorkGroupID.x] = temp[rootSlot];
        temp[rootSlot] = 0u;
    }

    // Down-sweep phase: the left child takes the parent's value, the right
    // child takes parent + old left child.
    stride = tileElems >> 1;
    while (stride > 0)
    {
        barrier();

        const uint right = (lane + 1) * (stride << 1) - 1;
        if (right < tileElems)
        {
            const uint leftSlot  = OFFSET(right - stride);
            const uint rightSlot = OFFSET(right);

            const uint oldLeft = temp[leftSlot];
            const uint parent  = temp[rightSlot];

            temp[leftSlot]  = parent;
            temp[rightSlot] = parent + oldLeft;
        }

        stride >>= 1;
    }

    // The final down-sweep step (stride 1) touched exactly this invocation's
    // own two slots, so they can be read back without another barrier.
    // Adding the original input to the swept value gives the stored result.
    if (globalLo < p.size)
    {
        array[globalLo] = temp[slotLo] + inLo;
    }
    if (globalHi < p.size)
    {
        array[globalHi] = temp[slotHi] + inHi;
    }
}