quick commit

This commit is contained in:
2026-01-03 22:55:08 +01:00
parent 488b5a7b03
commit 9131bf063e
12 changed files with 435 additions and 150 deletions

View File

@@ -8,28 +8,30 @@ in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
// Why did we not have conflicts in the Reduction?
// Why did we not have conflicts in the Reduction?
// Because of the sequential addressing (here we use interleaved => we have conflicts).
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
// NUM_BANKS_LOG equals log2(NUM_BANKS); it is the shift applied when padding shared-memory indices.
// NUM_BANKS and SIMD_GROUP_SIZE are not referenced in the part of the shader visible here.
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32
// NOTE(review): the three defines below repeat the ones above with the same values — this looks like
// old and new lines of a diff shown together; confirm the real file contains them only once.
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32
// Number of elements one workgroup stages in shared memory; main() uses it as the scan length.
#define BUFFER_SIZE 256
// Workgroup width along x is taken from specialization constant 0; y and z are fixed to 1.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Push-constant block with a single uint `size`, accessed as `p.size`; main() uses it as the
// upper bound when reading from and writing to `array`.
// NOTE(review): the block header appears twice below with different brace placement — this looks
// like old and new lines of a diff shown together; confirm the real file declares it only once.
layout(push_constant) uniform PushStruct {
layout(push_constant) uniform PushStruct
{
uint size;
} p;
}
p;
// Storage buffers: binding 0 holds the data that main() reads and overwrites in place,
// binding 1 receives one value per workgroup.
// NOTE(review): each binding is declared twice below (old and new spelling); main() refers to
// `array` and `g_v`, so the `offsetBuffer { uint g_v[]; }` form is the one the code relies on.
layout(binding = 0) buffer inoutBufer {uint array[];};
layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
layout(binding = 0) buffer inoutBufer { uint array[]; };
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };
// Shared staging array for one workgroup: BUFFER_SIZE elements plus one extra slot per
// 2^NUM_BANKS_LOG elements, which leaves room for the padding that OFFSET inserts.
shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];
// Index padding, intended to avoid bank conflicts.
// Remove the define below to address temp without padding.
#define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS
// Maps logical index A to a padded index by skipping one slot after every 2^NUM_BANKS_LOG elements.
#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
#else
// Identity mapping: logical and physical indices coincide.
#define OFFSET(A) (A)
#endif
@@ -37,16 +39,54 @@ layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Work-group prefix sum over BUFFER_SIZE elements staged in the shared array `temp`.
// Every invocation handles two consecutive elements of `array`. Positions at or beyond p.size
// are loaded as 0 and never written back. The work-group total is stored in
// g_v[gl_WorkGroupID.x], and each in-range element of `array` is replaced by the sum of the
// work-group's elements up to and including itself.
// NOTE(review): the indexing assumes 2 * local_size_x == BUFFER_SIZE — confirm the value
// supplied for specialization constant 0.
void main()
{
    const uint lane = gl_LocalInvocationID.x;
    const uint globalLane = gl_GlobalInvocationID.x;
    const uint count = BUFFER_SIZE;

    // Positions of this invocation's two elements in `array` and their padded slots in `temp`.
    const uint srcA = 2 * globalLane;
    const uint srcB = 2 * globalLane + 1;
    const uint slotA = OFFSET(2 * lane);
    const uint slotB = OFFSET(2 * lane + 1);

    // Fetch the inputs; the values are kept in locals because they are added back at the end.
    uint inA = 0;
    if (srcA < p.size)
    {
        inA = array[srcA];
    }
    uint inB = 0;
    if (srcB < p.size)
    {
        inB = array[srcB];
    }
    temp[slotA] = inA;
    temp[slotB] = inB;

    // Reduction pass: each step folds the left partner into the right one, doubling the span.
    // The barrier at the top of each step orders it after the previous step's writes.
    for (uint span = 1; span < count; span *= 2)
    {
        barrier();
        const uint right = 2 * span * (lane + 1) - 1;
        if (right < count)
        {
            temp[OFFSET(right)] += temp[OFFSET(right - span)];
        }
    }

    // The final slot now holds the work-group total. Invocation 0 wrote that slot itself in the
    // last reduction step, so it can publish the total and reset the slot without another barrier.
    const uint lastSlot = OFFSET(count - 1);
    if (lane == 0)
    {
        g_v[gl_WorkGroupID.x] = temp[lastSlot];
        temp[lastSlot] = 0;
    }

    // Distribution pass: walk the spans back down, handing the right value to the left partner
    // and accumulating the old left value into the right one.
    for (uint span = count / 2; span != 0; span /= 2)
    {
        barrier();
        const uint right = 2 * span * (lane + 1) - 1;
        if (right < count)
        {
            const uint left = right - span;
            const uint leftSum = temp[OFFSET(left)];
            temp[OFFSET(left)] = temp[OFFSET(right)];
            temp[OFFSET(right)] += leftSum;
        }
    }

    // In the last step (span == 1) each invocation wrote exactly the two slots it reads here.
    // Adding the original inputs back includes each element in its own sum.
    if (srcA < p.size)
    {
        array[srcA] = temp[slotA] + inA;
    }
    if (srcB < p.size)
    {
        array[srcB] = temp[slotB] + inB;
    }
}