quick commit

2026-01-03 22:55:08 +01:00
parent 488b5a7b03
commit 9131bf063e
12 changed files with 435 additions and 150 deletions
--- a/shaders/A2Task2KernelDecomposition.comp
+++ b/shaders/A2Task2KernelDecomposition.comp
@@ -8,28 +8,30 @@ in uvec3 gl_GlobalInvocationID;
 in uint  gl_LocalInvocationIndex;
 */

-// Why did we not have conflicts in the Reduction? 
+// Why did we not have conflicts in the Reduction?
 // Because of the sequential addressing (here we use interleaved => we have conflicts).
 // TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
-#define NUM_BANKS			32
-#define NUM_BANKS_LOG		5
-#define SIMD_GROUP_SIZE		32
+#define NUM_BANKS 32
+#define NUM_BANKS_LOG 5
+#define SIMD_GROUP_SIZE 32
+#define BUFFER_SIZE 256

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

-layout(push_constant) uniform PushStruct {
+layout(push_constant) uniform PushStruct
+{
    uint size;
-} p;
+}
+p;

-layout(binding = 0) buffer inoutBufer {uint array[];};
-layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
+layout(binding = 0) buffer inoutBufer { uint array[]; };
+layout(binding = 1) buffer offsetBuffer { uint g_v[]; };

-// TODO: Shared variables
+shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];

 // Bank conflicts
-#define AVOID_BANK_CONFLICTS
 #ifdef AVOID_BANK_CONFLICTS
-// TODO: define your conflict-free macro here
+#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
 #else
 #define OFFSET(A) (A)
 #endif
@@ -37,16 +39,54 @@ layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void main()
 {
-    // TODO: Kernel implementation
+    const uint tid  = gl_LocalInvocationID.x;
+    const uint gid  = gl_GlobalInvocationID.x;
+    const uint size = BUFFER_SIZE;

-    // Cache first half of elements in the local memory
-    // Cache second half of elements
+    uint val0 = 0;
+    uint val1 = 0;

-    // Perform up-sweep
+    if (2 * gid < p.size)
+        val0 = array[2 * gid];
+    if (2 * gid + 1 < p.size)
+        val1 = array[2 * gid + 1];

-    // Unroll the last steps when arrived at warp size
-    // Set the last element to 0
+    temp[OFFSET(2 * tid)]     = val0;
+    temp[OFFSET(2 * tid + 1)] = val1;

+    // Up-Sweep (Reduction) phase
+    for (uint stride = 1; stride < size; stride <<= 1)
+    {
+        barrier();
+        uint idx = (tid + 1) * stride * 2 - 1;
+        if (idx < size)
+        {
+            temp[OFFSET(idx)] += temp[OFFSET(idx - stride)];
+        }
+    }

-    // Perform down-sweep
+    // Store the work-group total in g_v, then clear the last element for the down-sweep
+    if (tid == 0)
+    {
+        g_v[gl_WorkGroupID.x] = temp[OFFSET(size - 1)];
+        temp[OFFSET(size - 1)] = 0;
+    }
+
+    // Down-Sweep phase
+    for (uint stride = size >> 1; stride > 0; stride >>= 1)
+    {
+        barrier();
+        uint idx = (tid + 1) * stride * 2 - 1;
+        if (idx < size)
+        {
+            uint t                     = temp[OFFSET(idx - stride)];
+            temp[OFFSET(idx - stride)] = temp[OFFSET(idx)];
+            temp[OFFSET(idx)] += t;
+        }
+    }
+
+    if (2 * gid < p.size)
+        array[2 * gid]     = temp[OFFSET(2 * tid)] + val0;
+    if (2 * gid + 1 < p.size)
+        array[2 * gid + 1] = temp[OFFSET(2 * tid + 1)] + val1;
 }