quick commit

2026-01-03 22:55:08 +01:00
parent 488b5a7b03
commit 9131bf063e
12 changed files with 435 additions and 150 deletions
--- a/shaders/A2Task1KernelDecomposition.comp
+++ b/shaders/A2Task1KernelDecomposition.comp
@@ -9,14 +9,18 @@ in uint  gl_LocalInvocationIndex;
 */
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

-layout(push_constant) uniform PushStruct {
-    uint size;
-} p;
+layout(push_constant) uniform PushStruct
+{
+    uint offset; // main() only reads v[i] when i < p.offset
+}
+p;

-layout(binding = 0) buffer inBuffer {
+layout(binding = 0) buffer inBuffer // input values read by main()
+{
    uint v[];
 };
-layout(binding = 1) buffer outBuffer {
+layout(binding = 1) buffer outBuffer // main() stores one result per work group at g_v[gl_WorkGroupID.x]
+{
    uint g_v[];
 };

@@ -25,22 +29,44 @@ layout(binding = 1) buffer outBuffer {
 const uint bufferSize = 256;
 shared uint[bufferSize] localBuffer;

-void main() {
-    // TODO: Kernel implementation
+// Sums up to 2 * gl_WorkGroupSize.x consecutive elements of v[] per work group
+// and stores the partial sum in g_v[gl_WorkGroupID.x].
+void main()
+{
+    uint tid    = gl_LocalInvocationID.x;
+    uint gid    = gl_WorkGroupID.x;
+    // Two elements are consumed per invocation, so consecutive work groups
+    // start 2 * gl_WorkGroupSize.x elements apart.
+    uint offset = gid * 2 * gl_WorkGroupSize.x;

-    for (uint i = p.size / 2; i < 0; i -= 2) {
-        localBuffer[i] = v[i] + v[i + 1];
-    }
+    uint idx1 = offset + tid;
+    uint idx2 = offset + tid + gl_WorkGroupSize.x;

-    for (uint j = bufferSize ; j != 0; j / 2) {
-        for (uint i = bufferSize / 2; i < 0; i -= 2) {
-            localBuffer[i] = localBuffer[i] + localBuffer[i + 1];
+    uint val1 = 0;
+    uint val2 = 0;
+
+    // p.offset is the exclusive upper bound for reads from v[]; slots past it contribute 0.
+    if (idx1 < p.offset)
+        val1 = v[idx1];
+    if (idx2 < p.offset)
+        val2 = v[idx2];
+
+    localBuffer[tid] = val1 + val2;
+
+    // Reduction in shared memory
+    for (uint s = gl_WorkGroupSize.x / 2; s > 0; s /= 2)
+    {
+        // barrier() must be reached by every invocation, so it sits outside the
+        // tid-dependent branch; it orders the previous stores before this pass reads them.
+        barrier();
+        if (tid < s)
+        {
+            localBuffer[tid] += localBuffer[tid + s];
         }
     }

-    localBuffer[0] = localBuffer[0] + localBuffer[1];
-
-    for (uint i = 0; i < bufferSize; i ++) {
-        g_v[i] = localBuffer[i];
+    if (tid == 0)
+    {
+        g_v[gid] = localBuffer[tid];
     }
-}
+}
--- a/shaders/A2Task1KernelDecompositionAtomic.comp
+++ b/shaders/A2Task1KernelDecompositionAtomic.comp
@@ -9,16 +9,55 @@ in uint  gl_LocalInvocationIndex;
 */
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

-layout(push_constant) uniform PushStruct {
-    uint size;
+layout(push_constant) uniform PushStruct
+{
    uint offset;
-} p;
+}
+p;

-layout(binding = 0) buffer inBuffer { uint v[]; };
-layout(binding = 1) buffer outBuffer { uint g_v[]; };
+layout(binding = 0) buffer inBuffer
+{
+    uint v[];
+};
+layout(binding = 1) buffer outBuffer
+{
+    uint g_v[];
+};

-// TODO: Shared variables
+// Per-work-group accumulator; every invocation atomically adds its partial sum.
+shared uint localBuffer;

-void main() {
-    // TODO: Kernel implementation
-}
+// Sums up to 2 * gl_WorkGroupSize.x consecutive elements of v[] per work group
+// and stores the partial sum in g_v[gl_WorkGroupID.x].
+void main()
+{
+    uint tid = gl_LocalInvocationID.x;
+    uint gid = gl_WorkGroupID.x;
+    // Two elements are consumed per invocation, so consecutive work groups
+    // start 2 * gl_WorkGroupSize.x elements apart.
+    uint offset = gid * 2 * gl_WorkGroupSize.x;
+
+    uint idx1 = offset + tid;
+    uint idx2 = offset + tid + gl_WorkGroupSize.x;
+
+    uint val1 = 0;
+    uint val2 = 0;
+
+    // p.offset is the exclusive upper bound for reads from v[]; slots past it contribute 0.
+    if (idx1 < p.offset)
+        val1 = v[idx1];
+    if (idx2 < p.offset)
+        val2 = v[idx2];
+
+    // One invocation zeroes the accumulator; the barrier orders that store before any atomicAdd.
+    if (tid == 0)
+        localBuffer = 0;
+    barrier();
+
+    uint partial = val1 + val2;
+    atomicAdd(localBuffer, partial);
+    // Wait until every invocation has contributed before publishing the total.
+    barrier();
+    if (tid == 0)
+        g_v[gid] = localBuffer;
+}
--- a/shaders/A2Task1KernelDecompositionUnroll.comp
+++ b/shaders/A2Task1KernelDecompositionUnroll.comp
@@ -9,16 +9,102 @@ in uint  gl_LocalInvocationIndex;
 */
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

-layout(push_constant) uniform PushStruct {
-    uint size;
+layout(push_constant) uniform PushStruct
+{
    uint offset;
-} p;
+}
+p;

-layout(binding = 0) buffer inBuffer { uint v[]; };
-layout(binding = 1) buffer outBuffer { uint g_v[]; };
+layout(binding = 0) buffer inBuffer
+{
+    uint v[];
+};
+layout(binding = 1) buffer outBuffer
+{
+    uint g_v[];
+};

 // TODO: Shared variables
+// One slot per invocation: every invocation adds two inputs before storing,
+// so the shared buffer needs only half as many slots as elements are read.
+const uint bufferSize = 256;
+shared uint[bufferSize] localBuffer;

-void main() {
-    // TODO: Kernel implementation
-}
+// Sums up to 2 * gl_WorkGroupSize.x consecutive elements of v[] per work group
+// and stores the partial sum in g_v[gl_WorkGroupID.x].
+void main()
+{
+    uint tid    = gl_LocalInvocationID.x;
+    uint gid    = gl_WorkGroupID.x;
+    // Two elements are consumed per invocation, so consecutive work groups
+    // start 2 * gl_WorkGroupSize.x elements apart.
+    uint offset = gid * 2 * gl_WorkGroupSize.x;
+
+    uint idx1 = offset + tid;
+    uint idx2 = offset + tid + gl_WorkGroupSize.x;
+
+    uint val1 = 0;
+    uint val2 = 0;
+
+    // p.offset is the exclusive upper bound for reads from v[]; slots past it contribute 0.
+    if (idx1 < p.offset)
+        val1 = v[idx1];
+    if (idx2 < p.offset)
+        val2 = v[idx2];
+
+    localBuffer[tid] = val1 + val2;
+    barrier();
+
+    // Reduction in shared memory (the remaining steps are unrolled below)
+    for (uint s = gl_WorkGroupSize.x / 2; s > 32; s >>= 1)
+    {
+        if (tid < s)
+            localBuffer[tid] += localBuffer[tid + s];
+        barrier();
+    }
+
+    // Unrolled tail. barrier() must be reached by every invocation of the work
+    // group, so it sits outside the tid-dependent branches; the
+    // gl_WorkGroupSize.x tests evaluate identically for the whole group.
+    if (gl_WorkGroupSize.x >= 64)
+    {
+        if (tid < 32)
+            localBuffer[tid] += localBuffer[tid + 32];
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 32)
+    {
+        if (tid < 16)
+            localBuffer[tid] += localBuffer[tid + 16];
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 16)
+    {
+        if (tid < 8)
+            localBuffer[tid] += localBuffer[tid + 8];
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 8)
+    {
+        if (tid < 4)
+            localBuffer[tid] += localBuffer[tid + 4];
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 4)
+    {
+        if (tid < 2)
+            localBuffer[tid] += localBuffer[tid + 2];
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 2)
+    {
+        // Last step: invocation 0 reads this result back itself, so no barrier follows.
+        if (tid < 1)
+            localBuffer[tid] += localBuffer[tid + 1];
+    }
+
+    if (tid == 0)
+    {
+        g_v[gid] = localBuffer[tid];
+    }
+}
--- a/shaders/A2Task2KernelDecomposition.comp
+++ b/shaders/A2Task2KernelDecomposition.comp
@@ -8,28 +8,30 @@ in uvec3 gl_GlobalInvocationID;
 in uint  gl_LocalInvocationIndex;
 */

-// Why did we not have conflicts in the Reduction? 
+// Why did we not have conflicts in the Reduction?
 // Because of the sequential addressing (here we use interleaved => we have conflicts).
 // TODO: tailor to your architecture (these parameter work for virtually all NVIDIA GPUs)
-#define NUM_BANKS			32
-#define NUM_BANKS_LOG		5
-#define SIMD_GROUP_SIZE		32
+#define NUM_BANKS 32
+#define NUM_BANKS_LOG 5
+#define SIMD_GROUP_SIZE 32
+#define BUFFER_SIZE 256

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

-layout(push_constant) uniform PushStruct {
+layout(push_constant) uniform PushStruct
+{
    uint size;
-} p;
+}
+p;

-layout(binding = 0) buffer inoutBufer {uint array[];};
-layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
+layout(binding = 0) buffer inoutBufer { uint array[]; }; // main() reads its inputs from and writes its results to this buffer
+layout(binding = 1) buffer offsetBuffer { uint g_v[]; }; // main() stores one value per work group at g_v[gl_WorkGroupID.x]

-// TODO: Shared variables
+shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)]; // extra slots leave room for padded OFFSET() indices; NOTE(review): this change removes `#define AVOID_BANK_CONFLICTS`, so unless it is defined elsewhere OFFSET() is the identity and the padding is unused - confirm intended

 // Bank conflicts
-#define AVOID_BANK_CONFLICTS
 #ifdef AVOID_BANK_CONFLICTS
-// TODO: define your conflict-free macro here
+#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
 #else
 #define OFFSET(A) (A)
 #endif
@@ -37,16 +39,54 @@ layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
 //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void main()
 {
-    // TODO: Kernel implementation
+    const uint tid  = gl_LocalInvocationID.x;
+    const uint gid  = gl_GlobalInvocationID.x;
+    const uint size = BUFFER_SIZE; // number of temp[] entries the sweeps below operate on

-    // Cache first half of elements in the local memory
-    // Cache second half of elements
+    uint val0 = 0; // inputs at or beyond p.size are treated as 0
+    uint val1 = 0;

-    // Perform up-sweep
+    if (2 * gid < p.size) // each invocation handles the element pair (2 * gid, 2 * gid + 1)
+        val0 = array[2 * gid];
+    if (2 * gid + 1 < p.size)
+        val1 = array[2 * gid + 1];

-    // Unroll the last steps when arrived at warp size
-    // Set the last element to 0
+    temp[OFFSET(2 * tid)]     = val0;
+    temp[OFFSET(2 * tid + 1)] = val1;

+    // Up-sweep (reduction) phase: with doubling stride, every (2 * stride)-th entry accumulates the entry `stride` slots before it
+    for (uint stride = 1; stride < size; stride <<= 1)
+    {
+        barrier(); // reached by all invocations before each pass reads temp[]
+        uint idx = (tid + 1) * stride * 2 - 1;
+        if (idx < size)
+        {
+            temp[OFFSET(idx)] += temp[OFFSET(idx - stride)];
+        }
+    }

-    // Perform down-sweep
+    // Store the up-sweep result held in the last entry to g_v for this work group, then clear that entry for the down-sweep
+    if (tid == 0)
+    {
+        g_v[gl_WorkGroupID.x] = temp[OFFSET(size - 1)];
+        temp[OFFSET(size - 1)] = 0;
+    }
+
+    // Down-sweep phase: with halving stride, the left entry takes the right entry's value and the right entry becomes right + old left
+    for (uint stride = size >> 1; stride > 0; stride >>= 1)
+    {
+        barrier(); // reached by all invocations before each pass reads temp[]
+        uint idx = (tid + 1) * stride * 2 - 1;
+        if (idx < size)
+        {
+            uint t                     = temp[OFFSET(idx - stride)];
+            temp[OFFSET(idx - stride)] = temp[OFFSET(idx)];
+            temp[OFFSET(idx)] += t;
+        }
+    }
+
+    if (2 * gid < p.size) // write back the down-sweep result plus this invocation's own original input
+        array[2 * gid]     = temp[OFFSET(2 * tid)] + val0;
+    if (2 * gid + 1 < p.size)
+        array[2 * gid + 1] = temp[OFFSET(2 * tid + 1)] + val1;
 }
--- a/shaders/A2Task2KernelDecompositionOffset.comp
+++ b/shaders/A2Task2KernelDecompositionOffset.comp
@@ -8,18 +8,35 @@ in uvec3 gl_GlobalInvocationID;
 in uint  gl_LocalInvocationIndex;
 */
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-layout (constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
+layout(constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;

 // Push constant
-layout(push_constant) uniform PushStruct {
+layout(push_constant) uniform PushStruct
+{
    uint size;
-} p;
+}
+p;

-layout(binding = 0) buffer inoutBufer { uint v[]; };
-layout(binding = 1) buffer offsetBufer { uint g_v[]; };
+layout(binding = 0) buffer inoutBufer { uint data[]; };
+layout(binding = 1) buffer offsetBufer { uint offsets[]; };

-// TODO: Shared variables
+// Adds offsets[group_id - 1] to the two data[] elements owned by each invocation.
+// NOTE(review): data[] indices are not checked against p.size; this assumes the
+// buffer spans whole 256-element slices for every dispatched work group - confirm.
+void main()
+{
+    uint tid      = gl_LocalInvocationID.x;
+    uint group_id = gl_WorkGroupID.x;

-void main() {
-    // TODO: Shared variables
-}
+    // Work group 0 has no predecessor: group_id - 1 would wrap to 0xFFFFFFFF
+    // and index far outside offsets[], so its slice is left untouched.
+    if (group_id == 0)
+        return;
+
+    uint gid0 = group_id * 256 + 2 * tid;
+    uint gid1 = group_id * 256 + 2 * tid + 1;
+
+    uint offset = offsets[group_id - 1];
+    data[gid0] += offset;
+    data[gid1] += offset;
+}
--- a/shaders/A2Task2Naive.comp
+++ b/shaders/A2Task2Naive.comp
@@ -19,4 +19,15 @@ layout(binding = 0) buffer inBuffer { uint v[]; };
 layout(binding = 1) buffer outBufer { uint g_v[]; };

 void main() {
-}
+    // One output element per invocation; invocations at or past p.size do nothing.
+    const uint i = gl_GlobalInvocationID.x;
+    if (i < p.size) {
+        // Elements that have a neighbour p.offset positions to the left add it in;
+        // the first p.offset elements are copied through unchanged.
+        uint sum = v[i];
+        if (i >= p.offset) {
+            sum += v[i - p.offset];
+        }
+        g_v[i] = sum;
+    }
+}