53 lines
1.3 KiB
Plaintext
53 lines
1.3 KiB
Plaintext
#version 450
|
|
|
|
/* built in:
|
|
in uvec3 gl_NumWorkGroups;
|
|
in uvec3 gl_WorkGroupID;
|
|
in uvec3 gl_LocalInvocationID;
|
|
in uvec3 gl_GlobalInvocationID;
|
|
in uint gl_LocalInvocationIndex;
|
|
*/
|
|
|
|
// Why did we not have conflicts in the Reduction?
|
|
// Because of the sequential addressing (here we use interleaved => we have conflicts).
|
|
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
|
|
// Number of memory banks assumed by the bank-conflict avoidance in this shader.
#define NUM_BANKS 32
|
|
// log2(NUM_BANKS): 2^5 == 32. Must be changed together with NUM_BANKS.
#define NUM_BANKS_LOG 5
|
|
// NOTE(review): presumably the warp/subgroup width at which the sweep loops are
// meant to be unrolled; it is not referenced anywhere in the visible code - confirm.
#define SIMD_GROUP_SIZE 32
|
|
|
|
// Work-group size: x is taken from specialization constant 0 (local_size_x_id),
// y and z are fixed to 1, i.e. the work-group is one-dimensional.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
// Push-constant block, accessed through the instance name `p`.
layout(push_constant) uniform PushStruct {
|
|
// NOTE(review): presumably the number of elements in `array`; it is not read
// anywhere in the visible code - confirm against the host side.
uint size;
|
|
} p;
|
|
|
|
// Buffer block at binding 0 holding a runtime-sized uint array.
// NOTE(review): judging by the block name it is both input and output - confirm.
layout(binding = 0) buffer inoutBufer {uint array[];};
|
|
// Buffer block at binding 1 holding a second runtime-sized uint array.
// NOTE(review): presumably receives per-work-group results for a higher scan
// level; nothing in the visible code reads or writes it - confirm.
layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
|
|
|
|
// TODO: Shared variables
|
|
|
|
// Bank conflicts
//
// OFFSET(A) maps a logical index A of the local (shared) array to the index
// that is actually used for the access. Every access to the local array is
// expected to go through OFFSET() so that both variants below stay
// interchangeable.
#define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS
// Conflict-free variant: skip one padding slot after every NUM_BANKS elements
// (A >> NUM_BANKS_LOG == A / NUM_BANKS), so indices that are a multiple of
// NUM_BANKS apart no longer map to the same bank.
// Notes:
//  - A is evaluated twice; do not pass expressions with side effects.
//  - The local array must be sized for the padding, i.e. it needs at least
//    OFFSET(n - 1) + 1 slots for n logical elements.
#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
#else
// Identity mapping: no padding, bank conflicts are not avoided.
#define OFFSET(A) (A)
#endif
|
|
|
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
|
/*
 * Shader entry point.
 *
 * The kernel is still an unimplemented skeleton: the body contains no
 * statements, so it currently reads and writes nothing. The comments inside
 * list the steps of the planned implementation in order.
 *
 * NOTE(review): the outline ("first half" / "second half" of the elements)
 * suggests that each invocation is meant to handle two elements - confirm
 * against the host-side dispatch before implementing.
 */
void main()
{
    // TODO: Kernel implementation

    // Step 1: Cache first half of elements in the local memory
    // Step 2: Cache second half of elements

    // Step 3: Perform up-sweep

    // Step 4: Unroll the last steps when arrived at warp size
    // Step 5: Set the last element to 0

    // Step 6: Perform down-sweep
}
|