From 9131bf063ed50991a327612d0f901d610f2c4445 Mon Sep 17 00:00:00 2001
From: Arif Hasanic
Date: Sat, 3 Jan 2026 22:55:08 +0100
Subject: [PATCH] quick commit

---
 shaders/A2Task1KernelDecomposition.comp       |  55 +++++---
 shaders/A2Task1KernelDecompositionAtomic.comp |  50 +++++--
 shaders/A2Task1KernelDecompositionUnroll.comp | 106 ++++++++++++--
 shaders/A2Task2KernelDecomposition.comp       |  76 +++++++---
 shaders/A2Task2KernelDecompositionOffset.comp |  27 ++--
 shaders/A2Task2Naive.comp                     |  12 +-
 src/A2Task1Solution/KernelDecomposition.cpp   |  32 ++---
 src/A2Task1Solution/KernelDecomposition.h     |   2 +-
 src/A2Task2Solution/KernelDecomposition.cpp   | 131 ++++++++++++++----
 src/A2Task2Solution/KernelDecomposition.h     |   3 +-
 src/A2Task2Solution/Naive.cpp                 |  59 +++++---
 src/main.cpp                                  |  32 ++---
 12 files changed, 435 insertions(+), 150 deletions(-)

diff --git a/shaders/A2Task1KernelDecomposition.comp b/shaders/A2Task1KernelDecomposition.comp
index f9b0d48..584d6db 100644
--- a/shaders/A2Task1KernelDecomposition.comp
+++ b/shaders/A2Task1KernelDecomposition.comp
@@ -9,14 +9,18 @@ in uint gl_LocalInvocationIndex;
  */

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-layout(push_constant) uniform PushStruct {
-    uint size;
-} p;
+layout(push_constant) uniform PushStruct
+{
+    uint offset; // element count of the current pass; indices >= offset contribute 0
+}
+p;

-layout(binding = 0) buffer inBuffer {
+layout(binding = 0) buffer inBuffer
+{
     uint v[];
 };
-layout(binding = 1) buffer outBuffer {
+layout(binding = 1) buffer outBuffer
+{
     uint g_v[];
 };

@@ -25,22 +29,37 @@ layout(binding = 1) buffer outBuffer {
 const uint bufferSize = 256;
 shared uint[bufferSize] localBuffer;

-void main() {
-    // TODO: Kernel implementation
+void main()
+{
+    uint tid = gl_LocalInvocationID.x;
+    uint gid = gl_WorkGroupID.x;
+    uint offset = gid * gl_WorkGroupSize.x * 2; // each work group consumes 2 * local_size_x inputs (idx1 and idx2)

-    for (uint i = p.size / 2; i < 0; i -= 2) {
-        localBuffer[i] = v[i] + v[i + 1];
-    }
+    uint idx1 = offset + tid;
+    uint idx2 = offset + tid + gl_WorkGroupSize.x;

-    for (uint j = bufferSize ; j != 0; j / 2) {
-        for (uint i = bufferSize / 2; i < 0; i -= 2) {
-            localBuffer[i] = localBuffer[i] + localBuffer[i + 1];
+    uint val1 = 0;
+    uint val2 = 0;
+
+    if (idx1 < p.offset)
+        val1 = v[idx1];
+    if (idx2 < p.offset)
+        val2 = v[idx2];
+
+    localBuffer[tid] = val1 + val2; // first reduction step happens while loading
+
+    // Reduction in shared memory
+    for (uint s = gl_WorkGroupSize.x / 2; s > 0; s /= 2)
+    {
+        barrier(); // reached by every invocation: barrier() must not sit inside the divergent tid < s branch
+        if (tid < s)
+        {
+            localBuffer[tid] += localBuffer[tid + s];
         }
     }

-    localBuffer[0] = localBuffer[0] + localBuffer[1];
-
-    for (uint i = 0; i < bufferSize; i ++) {
-        g_v[i] = localBuffer[i];
+    if (tid == 0)
+    {
+        g_v[gid] = localBuffer[tid]; // one partial sum per work group
     }
-}
\ No newline at end of file
+}
diff --git a/shaders/A2Task1KernelDecompositionAtomic.comp b/shaders/A2Task1KernelDecompositionAtomic.comp
index 8967b69..d7e79e6 100644
--- a/shaders/A2Task1KernelDecompositionAtomic.comp
+++ b/shaders/A2Task1KernelDecompositionAtomic.comp
@@ -9,16 +9,48 @@ in uint gl_LocalInvocationIndex;
  */

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-layout(push_constant) uniform PushStruct {
-    uint size;
+layout(push_constant) uniform PushStruct
+{
     uint offset;
-} p;
+}
+p;

-layout(binding = 0) buffer inBuffer { uint v[]; };
-layout(binding = 1) buffer outBuffer { uint g_v[]; };
+layout(binding = 0) buffer inBuffer
+{
+    uint v[];
+};
+layout(binding = 1) buffer outBuffer
+{
+    uint g_v[];
+};

-// TODO: Shared variables
+const uint bufferSize = 256;
+shared uint localBuffer; // single accumulator shared by the whole work group

-void main() {
-    // TODO: Kernel implementation
-}
\ No newline at end of file
+void main()
+{
+    uint tid = gl_LocalInvocationID.x;
+    uint gid = gl_WorkGroupID.x;
+    uint offset = gid * gl_WorkGroupSize.x * 2; // each work group consumes 2 * local_size_x inputs (idx1 and idx2)
+
+    uint idx1 = offset + tid;
+    uint idx2 = offset + tid + gl_WorkGroupSize.x;
+
+    uint val1 = 0;
+    uint val2 = 0;
+
+    if (idx1 < p.offset)
+        val1 = v[idx1];
+    if (idx2 < p.offset)
+        val2 = v[idx2];
+
+    if (tid == 0)
+        localBuffer = 0;
+    barrier(); // the zeroed accumulator must be visible before anyone adds to it
+
+    uint partial = val1 + val2;
+    atomicAdd(localBuffer, partial);
+    barrier(); // all adds must have landed before invocation 0 reads the total
+    if (tid == 0)
+        g_v[gid] = localBuffer;
+}
diff --git a/shaders/A2Task1KernelDecompositionUnroll.comp b/shaders/A2Task1KernelDecompositionUnroll.comp
index 8967b69..dd8852b 100644
--- a/shaders/A2Task1KernelDecompositionUnroll.comp
+++ b/shaders/A2Task1KernelDecompositionUnroll.comp
@@ -9,16 +9,106 @@ in uint gl_LocalInvocationIndex;
  */

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-layout(push_constant) uniform PushStruct {
-    uint size;
+layout(push_constant) uniform PushStruct
+{
     uint offset;
-} p;
+}
+p;

-layout(binding = 0) buffer inBuffer { uint v[]; };
-layout(binding = 1) buffer outBuffer { uint g_v[]; };
+layout(binding = 0) buffer inBuffer
+{
+    uint v[];
+};
+layout(binding = 1) buffer outBuffer
+{
+    uint g_v[];
+};

 // TODO: Shared variables
+// 512 Elements but initial reduction is done
+const uint bufferSize = 256;
+shared uint[bufferSize] localBuffer;

-void main() {
-    // TODO: Kernel implementation
-}
\ No newline at end of file
+void main()
+{
+    uint tid = gl_LocalInvocationID.x;
+    uint gid = gl_WorkGroupID.x;
+    uint offset = gid * gl_WorkGroupSize.x * 2; // each work group consumes 2 * local_size_x inputs (idx1 and idx2)
+
+    uint idx1 = offset + tid;
+    uint idx2 = offset + tid + gl_WorkGroupSize.x;
+
+    uint val1 = 0;
+    uint val2 = 0;
+
+    if (idx1 < p.offset)
+        val1 = v[idx1];
+    if (idx2 < p.offset)
+        val2 = v[idx2];
+
+    localBuffer[tid] = val1 + val2;
+    barrier();
+
+    // Reduction in shared memory (strides above 32 stay in the loop, the rest is unrolled below)
+    for (uint s = gl_WorkGroupSize.x / 2; s > 32; s >>= 1)
+    {
+        if (tid < s)
+            localBuffer[tid] += localBuffer[tid + s];
+        barrier();
+    }
+
+    // Unrolled tail; every barrier() sits outside the divergent tid branch so all invocations reach it
+    if (gl_WorkGroupSize.x >= 64)
+    {
+        if (tid < 32)
+        {
+            localBuffer[tid] += localBuffer[tid + 32];
+        }
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 32)
+    {
+        if (tid < 16)
+        {
+            localBuffer[tid] += localBuffer[tid + 16];
+        }
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 16)
+    {
+        if (tid < 8)
+        {
+            localBuffer[tid] += localBuffer[tid + 8];
+        }
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 8)
+    {
+        if (tid < 4)
+        {
+            localBuffer[tid] += localBuffer[tid + 4];
+        }
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 4)
+    {
+        if (tid < 2)
+        {
+            localBuffer[tid] += localBuffer[tid + 2];
+        }
+        barrier();
+    }
+    if (gl_WorkGroupSize.x >= 2)
+    {
+        if (tid < 1)
+        {
+            localBuffer[tid] += localBuffer[tid + 1];
+        }
+        barrier();
+    }
+
+    if (tid == 0)
+    {
+        g_v[gid] = localBuffer[tid]; // one partial sum per work group
+    }
+}
diff --git a/shaders/A2Task2KernelDecomposition.comp b/shaders/A2Task2KernelDecomposition.comp
index d308fa9..472f340 100644
--- a/shaders/A2Task2KernelDecomposition.comp
+++ b/shaders/A2Task2KernelDecomposition.comp
@@ -8,28 +8,30 @@ in uvec3 gl_GlobalInvocationID;
 in uint gl_LocalInvocationIndex;
 */

-// Why did we not have conflicts in the Reduction? 
+// Why did we not have conflicts in the Reduction?
 // Because of the sequential addressing (here we use interleaved => we have conflicts).

 // TODO: tailor to your architecture (these parameter work for virtually all NVIDIA GPUs)
-#define NUM_BANKS 32
-#define NUM_BANKS_LOG 5
-#define SIMD_GROUP_SIZE 32
+#define NUM_BANKS 32
+#define NUM_BANKS_LOG 5
+#define SIMD_GROUP_SIZE 32
+#define BUFFER_SIZE 256

 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-layout(push_constant) uniform PushStruct {
+layout(push_constant) uniform PushStruct
+{
     uint size;
-} p;
+}
+p;

-layout(binding = 0) buffer inoutBufer {uint array[];};
-layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
+layout(binding = 0) buffer inoutBufer { uint array[]; };
+layout(binding = 1) buffer offsetBuffer { uint g_v[]; };

-// TODO: Shared variables
+shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];

 // Bank conflicts
-#define AVOID_BANK_CONFLICTS
 #ifdef AVOID_BANK_CONFLICTS
-// TODO: define your conflict-free macro here
+#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
 #else
 #define OFFSET(A) (A)
 #endif
@@ -37,16 +39,54 @@ layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 void main() {
-    // TODO: Kernel implementation
+    const uint tid = gl_LocalInvocationID.x;
+    const uint gid = gl_GlobalInvocationID.x;
+    const uint size = BUFFER_SIZE; // elements scanned per work group (two per invocation)

-    // Cache first half of elements in the local memory
-    // Cache second half of elements
+    uint val0 = 0;
+    uint val1 = 0;

-    // Perform up-sweep
+    if (2 * gid < p.size)
+        val0 = array[2 * gid];
+    if (2 * gid + 1 < p.size)
+        val1 = array[2 * gid + 1];

-    // Unroll the last steps when arrived at warp size
-    // Set the last element to 0
+    temp[OFFSET(2 * tid)] = val0;
+    temp[OFFSET(2 * tid + 1)] = val1;

+    // Up-Sweep (Reduction) phase
+    for (uint stride = 1; stride < size; stride <<= 1)
+    {
+        barrier();
+        uint idx = (tid + 1) * stride * 2 - 1;
+        if (idx < size)
+        {
+            temp[OFFSET(idx)] += temp[OFFSET(idx - stride)];
+        }
+    }

-    // Perform down-sweep
+    // Clear the last element
+    if (tid == 0)
+    {
+        g_v[gl_WorkGroupID.x] = temp[OFFSET(size - 1)]; // block total, consumed by the next level
+        temp[OFFSET(size - 1)] = 0;
+    }
+
+    // Down-Sweep phase
+    for (uint stride = size >> 1; stride > 0; stride >>= 1)
+    {
+        barrier();
+        uint idx = (tid + 1) * stride * 2 - 1;
+        if (idx < size)
+        {
+            uint t = temp[OFFSET(idx - stride)];
+            temp[OFFSET(idx - stride)] = temp[OFFSET(idx)];
+            temp[OFFSET(idx)] += t;
+        }
+    }
+
+    if (2 * gid < p.size)
+        array[2 * gid] = temp[OFFSET(2 * tid)] + val0; // exclusive result + own value = inclusive prefix sum
+    if (2 * gid + 1 < p.size)
+        array[2 * gid + 1] = temp[OFFSET(2 * tid + 1)] + val1;
 }
diff --git a/shaders/A2Task2KernelDecompositionOffset.comp b/shaders/A2Task2KernelDecompositionOffset.comp
index dd02fad..b5b308b 100644
--- a/shaders/A2Task2KernelDecompositionOffset.comp
+++ b/shaders/A2Task2KernelDecompositionOffset.comp
@@ -8,18 +8,27 @@ in uvec3 gl_GlobalInvocationID;
 in uint gl_LocalInvocationIndex;
 */
 layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
-layout (constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
+layout(constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;

 // Push constant
-layout(push_constant) uniform PushStruct {
+layout(push_constant) uniform PushStruct
+{
     uint size;
-} p;
+}
+p;

-layout(binding = 0) buffer inoutBufer { uint v[]; };
-layout(binding = 1) buffer offsetBufer { uint g_v[]; };
+layout(binding = 0) buffer inoutBufer { uint data[]; };
+layout(binding = 1) buffer offsetBufer { uint offsets[]; };

-// TODO: Shared variables
+void main()
+{
+    uint tid = gl_LocalInvocationID.x;
+    uint group_id = gl_WorkGroupID.x;

-void main() {
-    // TODO: Shared variables
-}
\ No newline at end of file
+    uint gid0 = group_id * 256 + 2 * tid; // 256 = elements per block, same value as BUFFER_SIZE in the scan shader
+    uint gid1 = group_id * 256 + 2 * tid + 1;
+
+    uint offset = (group_id > 0) ? offsets[group_id - 1] : 0; // block 0 has no predecessor; group_id - 1 would wrap to 0xFFFFFFFF
+    if (gid0 < p.size) data[gid0] += offset; // p.size guards the partially filled last block
+    if (gid1 < p.size) data[gid1] += offset;
+}
diff --git a/shaders/A2Task2Naive.comp b/shaders/A2Task2Naive.comp
index 6edfd98..58d54cd 100644
--- a/shaders/A2Task2Naive.comp
+++ b/shaders/A2Task2Naive.comp
@@ -19,4 +19,14 @@ layout(binding = 0) buffer inBuffer { uint v[]; };
 layout(binding = 1) buffer outBufer { uint g_v[]; };

 void main() {
-}
+    uint gid = gl_GlobalInvocationID.x;
+
+    if (gid >= p.size) {
+        return;
+    }
+
+    if (gid < p.offset)
+        g_v[gid] = v[gid]; // no element p.offset positions to the left: copy through
+    else
+        g_v[gid] = v[gid - p.offset] + v[gid];
+}
\ No newline at end of file
diff --git a/src/A2Task1Solution/KernelDecomposition.cpp b/src/A2Task1Solution/KernelDecomposition.cpp
index b9008c7..c564ed0 100644
--- a/src/A2Task1Solution/KernelDecomposition.cpp
+++ b/src/A2Task1Solution/KernelDecomposition.cpp
@@ -50,36 +50,32 @@ void A2Task1SolutionKernelDecomposition::compute()
     vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);

     cb.begin(beginInfo);
+
     cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
-    // TODO: Implement reduction with kernel decomposition
-    // NOTE: make sure that activeBuffer points to the buffer with the final result in the end
-    // That buffer is read back for the correctness check
-    // (A2Task1SolutionKernelDecomposition::result())
-    
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
-
-    uint blocksize = 512;
-    uint kernelCount = mpInput->size() / blocksize;
+    uint blocksize = workGroupSize * 2; // divisor used below to derive the work-group count of each pass
+    uint windowSize = mpInput->size(); // number of elements still to be reduced
     PushConstant p;

-    for (; kernelCount > 0; kernelCount /= 2)
+    activeBuffer = 1;
+
+    while (windowSize > 1)
     {
-        activeBuffer = activeBuffer == 0 ? 1 : 0;
+        activeBuffer = 1 - activeBuffer; // first pass uses descriptorSets[0], then the two sets alternate
+        uint numGroups = (windowSize + blocksize - 1) / blocksize; // ceil(windowSize / blocksize)
+
         cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
-        p.size = blocksize;
+        p.offset = windowSize;
         cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p);

-        // for loop to call each kernel
-        for (int i = 0; i < kernelCount; i++)
-        {
-            cb.dispatch(i * blocksize, 0, 0);
-        }
+        cb.dispatch(numGroups, 1, 1);

         vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
         cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
-        std::cout << "deine mum \n" ;
+        windowSize = numGroups; // the next pass covers numGroups elements
     }
+    // NOTE(review): result() reads back buffers[1 - activeBuffer]; presumably descriptorSets[k] writes into buffers[1 - k] -- confirm
     cb.end();

     vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
@@ -97,7 +93,7 @@ void A2Task1SolutionKernelDecomposition::compute()
 uint A2Task1SolutionKernelDecomposition::result() const
 {
     std::vector result(1, 0);
-    fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
+    fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[1 - activeBuffer], result); // reads the buffer opposite to the last descriptor set bound in compute()
     return result[0];
 }

diff --git a/src/A2Task1Solution/KernelDecomposition.h b/src/A2Task1Solution/KernelDecomposition.h
index cac390f..bc17faa 100644
--- a/src/A2Task1Solution/KernelDecomposition.h
+++ b/src/A2Task1Solution/KernelDecomposition.h
@@ -14,7 +14,7 @@ public:
 private:
     struct PushConstant
     {
-        uint size;
+        uint offset; // compute() stores the element count of the current pass here
     };

     AppResources &app;
diff --git a/src/A2Task2Solution/KernelDecomposition.cpp b/src/A2Task2Solution/KernelDecomposition.cpp
index 95b5e19..8a3bb44 100644
--- a/src/A2Task2Solution/KernelDecomposition.cpp
+++ b/src/A2Task2Solution/KernelDecomposition.cpp
@@ -1,12 +1,16 @@
 #include "KernelDecomposition.h"

+#include
+
 #include "host_timer.h"

-A2Task2SolutionKernelDecomposition::A2Task2SolutionKernelDecomposition(AppResources& app, uint workGroupSize): app(app),
-    workGroupSize(workGroupSize) {
+A2Task2SolutionKernelDecomposition::A2Task2SolutionKernelDecomposition(AppResources &app, uint workGroupSize) : app(app),
+                                                                                                                 workGroupSize(workGroupSize)
+{
 }

-void A2Task2SolutionKernelDecomposition::prepare(const std::vector& input) {
+void A2Task2SolutionKernelDecomposition::prepare(const std::vector &input)
+{
     workSize = input.size();

     // Descriptor & Pipeline Layout
@@ -19,41 +23,70 @@ void A2Task2SolutionKernelDecomposition::prepare(const std::vector& input)

     // Specialization constant for workgroup size
     std::array specEntries = std::array{
-            {{0U, 0U, sizeof(workGroupSize)}},
-        };
-    std::array specValues = {workGroupSize}; //for workgroup sizes
-    vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
-                                                             CAST(specValues) * sizeof(int), specValues.data());
+        {{0U, 0U, sizeof(workGroupSize)}},
+    };
+    std::array specValues = {workGroupSize}; // for workgroup sizes
+    vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());

     // Local PPS Pipeline
     Cmn::createShader(app.device, cShaderLocalPPS, workingDir + "build/shaders/A2Task2KernelDecomposition.comp.spv");
     Cmn::createPipeline(app.device, pipelineLocalPPS, pipelineLayout, specInfo, cShaderLocalPPS);

     // Local PPS Offset
Pipeline
-    Cmn::createShader(app.device, cShaderLocalPPSOffset,
-                      workingDir + "build/shaders/A2Task2KernelDecompositionOffset.comp.spv");
+    Cmn::createShader(app.device, cShaderLocalPPSOffset, workingDir + "build/shaders/A2Task2KernelDecompositionOffset.comp.spv");
     Cmn::createPipeline(app.device, pipelineLocalPPSOffset, pipelineLayout, specInfo, cShaderLocalPPSOffset);

     // ### create buffers, get their index in the task.buffers[] array ###
-    using BFlag = vk::BufferUsageFlagBits;
-    auto makeDLocalBuffer = [ this ](vk::BufferUsageFlags usage, vk::DeviceSize size, std::string name) -> Buffer {
+    using BFlag = vk::BufferUsageFlagBits;
+    auto makeDLocalBuffer = [this](vk::BufferUsageFlags usage, vk::DeviceSize size, std::string name) -> Buffer
+    {
         Buffer b;
-        createBuffer(app.pDevice, app.device, size, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, name, b.buf,
-                     b.mem);
+        createBuffer(app.pDevice, app.device, size, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, name, b.buf, b.mem);
         return b;
     };

+    size_t current_size = input.size();
+    levelSizes.clear();
+    levelSizes.push_back(current_size); // level 0 is the input itself
+    while (current_size > 256) // BUFFER_SIZE
+    {
+        current_size = (current_size + 255) / 256; // ceil(current_size / 256): one entry per 256-element block
+        levelSizes.push_back(current_size);
+    }
+
+    // Create buffers
+    // We need one buffer for each level, plus one for the final sum
+    for (size_t size : levelSizes)
+    {
+        inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
+                                                size * sizeof(uint32_t),
+                                                "buffer_level_" + std::to_string(inoutBuffers.size())));
+    }
+    // Extra buffer for the last level's output (total sum)
     inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
-                                            input.size() * sizeof(uint32_t), "buffer_inout_0"));
+                                            sizeof(uint32_t),
+                                            "buffer_final_sum"));

-    fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0],
-                                input);
+    fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0], input);

-    // TO DO create additional buffers (by pushing into inoutBuffers) and descriptors (by pushing into descriptorSets)
-    // You need to create an appropriately-sized DescriptorPool first
+    uint32_t numSets = static_cast(levelSizes.size()); // one descriptor set per level
+    Cmn::createDescriptorPool(app.device, bindings, descriptorPool, numSets);
+
+    // Allocate Descriptor Sets
+    std::vector layouts(numSets, descriptorSetLayout);
+    vk::DescriptorSetAllocateInfo allocInfo(descriptorPool, numSets, layouts.data());
+    descriptorSets = app.device.allocateDescriptorSets(allocInfo);
+
+    // Update Descriptor Sets
+    for (size_t i = 0; i < numSets; ++i)
+    {
+        Cmn::bindBuffers(app.device, inoutBuffers[i].buf, descriptorSets[i], 0); // set i pairs buffer i ...
+        Cmn::bindBuffers(app.device, inoutBuffers[i+1].buf, descriptorSets[i], 1); // ... with buffer i + 1
+    }
 }

-void A2Task2SolutionKernelDecomposition::compute() {
+void A2Task2SolutionKernelDecomposition::compute()
+{
     vk::CommandBufferAllocateInfo allocInfo(
         app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
     vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];
@@ -62,8 +95,46 @@ void A2Task2SolutionKernelDecomposition::compute() {

     cb.begin(beginInfo);

-    // TODO: Implement efficient version of scan
-    // Make sure that the local prefix sum works before you start experimenting with large arrays
+    cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipelineLocalPPS);
+
+    for (size_t i = 0; i < levelSizes.size(); ++i) // first phase: one dispatch per level, level 0 upwards
+    {
+        PushStruct p;
+        p.size = static_cast(levelSizes[i]);
+
+        cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[i], 0U, nullptr);
+        cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct), &p);
+
+        uint32_t groupCount = (levelSizes[i] + 255) / 256; // ceil(levelSizes[i] / 256)
+        cb.dispatch(groupCount, 1, 1);
+
+        // Barrier between levels
+        vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
+        cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader,
+                           vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
+    }
+
+    cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipelineLocalPPSOffset);
+
+    if (levelSizes.size() > 1) // a single level needs no offset phase
+    {
+        for (int i = levelSizes.size() - 2; i >= 0; i--) // second phase: from the second-to-last level down to level 0
+        {
+            PushStruct p;
+            p.size = static_cast(levelSizes[i]);
+
+            cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[i], 0U, nullptr);
+            cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct), &p);
+
+            uint32_t groupCount = (levelSizes[i] + 255) / 256;
+
+            cb.dispatch(groupCount, 1, 1);
+
+            vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
+            cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader,
+                               vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
+        }
+    }

     cb.end();

@@ -79,15 +150,15 @@ void A2Task2SolutionKernelDecomposition::compute() {
     app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
 }

-std::vector A2Task2SolutionKernelDecomposition::result() const {
+std::vector A2Task2SolutionKernelDecomposition::result() const
+{
     std::vector result(workSize, 0);
-    fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0],
-                              result);
+    fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0], result); // inoutBuffers[0] is the buffer the input was uploaded to
     return result;
 }

-
-void A2Task2SolutionKernelDecomposition::cleanup() {
+void A2Task2SolutionKernelDecomposition::cleanup()
+{
     app.device.destroyDescriptorPool(descriptorPool);

@@ -101,12 +172,14 @@ void A2Task2SolutionKernelDecomposition::cleanup() {
     app.device.destroyDescriptorSetLayout(descriptorSetLayout);
     bindings.clear();

-    auto Bclean = [&](Buffer& b) {
+    auto Bclean = [&](Buffer &b)
+    {
         app.device.destroyBuffer(b.buf);
         app.device.freeMemory(b.mem);
     };

-    for (auto inoutBuffer: inoutBuffers) {
+    for (auto inoutBuffer : inoutBuffers)
+    {
         Bclean(inoutBuffer);
     }

diff --git a/src/A2Task2Solution/KernelDecomposition.h b/src/A2Task2Solution/KernelDecomposition.h
index a31694c..92992da 100644
--- a/src/A2Task2Solution/KernelDecomposition.h
+++ b/src/A2Task2Solution/KernelDecomposition.h
@@ -50,6 +50,7 @@ private:
     // Descriptor Pool
     vk::DescriptorPool descriptorPool;

-    // TODO extend with any additional members you may need
+    std::vector descriptorSets; // allocated in prepare(), one per entry of levelSizes
+    std::vector levelSizes; // element count of every level, filled in prepare()
 };
\ No newline at end of file
diff --git a/src/A2Task2Solution/Naive.cpp b/src/A2Task2Solution/Naive.cpp
index 671320f..f5b0e25 100644
--- a/src/A2Task2Solution/Naive.cpp
+++ b/src/A2Task2Solution/Naive.cpp
@@ -1,13 +1,14 @@
 #include "Naive.h"
-#include "host_timer.h"

 #include

-A2Task2SolutioNaive::A2Task2SolutioNaive(
-    AppResources &app, uint workGroupSize):
-    app(app), workGroupSize(workGroupSize) {}
+#include "host_timer.h"

-void A2Task2SolutioNaive::prepare(const std::vector &input) {
+A2Task2SolutioNaive::A2Task2SolutioNaive(
+    AppResources &app, uint workGroupSize) : app(app), workGroupSize(workGroupSize) {}
+
+void A2Task2SolutioNaive::prepare(const std::vector &input)
+{
     workSize = input.size();

     // Descriptor & Pipeline Layout
@@ -19,15 +20,14 @@ void A2Task2SolutioNaive::prepare(const std::vector &input) {
     pipelineLayout = app.device.createPipelineLayout(pipInfo);

     // Specialization constant for workgroup size
-    std::array specEntries = std::array{
+    std::array specEntries = std::array{
         {{0U, 0U, sizeof(workGroupSize)}},
-    };
-    std::array specValues = {workGroupSize}; //for workgroup sizes
-    vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
-                                    CAST(specValues) * sizeof(int), specValues.data());
+    };
+    std::array specValues = {workGroupSize}; // for workgroup sizes
+    
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());

     // Local PPS Offset Pipeline
-    Cmn::createShader(app.device, cShader, workingDir +"build/shaders/A2Task2Naive.comp.spv");
+    Cmn::createShader(app.device, cShader, workingDir + "build/shaders/A2Task2Naive.comp.spv");
     Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, cShader);

     // ### create buffers, get their index in the task.buffers[] array ###
@@ -49,22 +49,35 @@ void A2Task2SolutioNaive::prepare(const std::vector &input) {
     activeBuffer = 0;
 }

-void A2Task2SolutioNaive::compute() {
+void A2Task2SolutioNaive::compute()
+{
     vk::CommandBufferAllocateInfo allocInfo(
         app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
-    vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
+    vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];

     vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);

     cb.begin(beginInfo);
-
     cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
-    // TODO: Implement naive scan
-    // NOTE: make sure that activeBuffer points to the buffer with the final result in the end
-    // That buffer is read back for the correctness check
-    // (A2Task2SolutionNaive::result())
-    // HINT: You can alternate between the two provided descriptor sets to implement ping-pong
+    PushStruct p;
+    p.size = workSize;
+
+    for (uint i = 1; i < workSize; i <<= 1) // i doubles every pass and is handed to the shader as p.offset
+    {
+        cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
+        p.offset = i;
+
+        cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct), &p);
+
+        cb.dispatch((workSize + workGroupSize - 1) / workGroupSize, 1, 1); // ceil(workSize / workGroupSize) work groups
+
+        vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
+        cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
+
+        activeBuffer = 1 - activeBuffer; // alternate between the two descriptor sets; result() reads buffers[activeBuffer]
+    }
+
     cb.end();

     vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
@@ -79,13 +92,15 @@ void A2Task2SolutioNaive::compute() {
     app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
 }

-std::vector A2Task2SolutioNaive::result() const {
+std::vector A2Task2SolutioNaive::result() const
+{
     std::vector result(workSize, 0);
     fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
     return result;
 }

-void A2Task2SolutioNaive::cleanup() {
+void A2Task2SolutioNaive::cleanup()
+{
     app.device.destroyDescriptorPool(descriptorPool);

     app.device.destroyPipeline(pipeline);
@@ -97,4 +112,4 @@ void A2Task2SolutioNaive::cleanup() {

     for (auto buffer : buffers)
         destroyBuffer(app.device, buffer);
-}
\ No newline at end of file
+}
diff --git a/src/main.cpp b/src/main.cpp
index 3cc64a4..7035c01 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -39,20 +39,20 @@ void run_A2_task1(AppResources &app){
         }
     };

-    // A2Task1SolutionInterleaved interleavedSolution(app, 128);
-    // evaluateTask1Solution(&interleavedSolution, "Interleaved");
+    A2Task1SolutionInterleaved interleavedSolution(app, 128);
+    evaluateTask1Solution(&interleavedSolution, "Interleaved");

-    // A2Task1SolutionSequential sequentialSolution(app, 128);
-    // evaluateTask1Solution(&sequentialSolution, "Sequential");
+    A2Task1SolutionSequential sequentialSolution(app, 128);
+    evaluateTask1Solution(&sequentialSolution, "Sequential");

     A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
     evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");

-    // A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
-    // evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
+    A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv"); // same host class, different shader file
+    evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");

-    // A2Task1SolutionKernelDecomposition kernelDecompositionAtomicSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionAtomic.comp.spv");
-    // evaluateTask1Solution(&kernelDecompositionAtomicSolution, "KernelDecomposition Atomic");
+    A2Task1SolutionKernelDecomposition kernelDecompositionAtomicSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionAtomic.comp.spv"); // same host class, different shader file
+    evaluateTask1Solution(&kernelDecompositionAtomicSolution, "KernelDecomposition Atomic");
 }

 void run_A2_task2(AppResources& app){
@@ -61,7 +61,7 @@ void run_A2_task2(AppResources& app){

     // This is used for testing local kernel decomposition without extension to arbitrary arrays.
     // Must be power of two and <= 1024!
-    size_t sizeLocal = 128;
+    size_t sizeLocal = 1024; // above the 256-element block size, so A2Task2SolutionKernelDecomposition::prepare() builds two levels for it

     A2Task2 a2Task2(size);
     A2Task2 a2Task2Local(sizeLocal);
@@ -75,7 +75,7 @@ void run_A2_task2(AppResources& app){
                 pass &= task->evaluateSolution(*solution);
                 solution->cleanup();
                 mstime += solution->mstime / N;
-                
+
                 if (!pass) break;
             }

@@ -91,11 +91,11 @@ void run_A2_task2(AppResources& app){
     A2Task2SolutioNaive naiveSolution(app, 128);
     evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);

-    // A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
-    // evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
+    A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, 128); // second argument is the work group size (no longer sizeLocal)
+    evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);

-    // A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
-    // evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
+    A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
+    evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
 }

 int main()
@@ -109,9 +109,9 @@ int main()
     renderdoc::initialize();
     renderdoc::startCapture();

-    run_A2_task1(app);
+    // run_A2_task1(app);

-    // run_A2_task2(app);
+    run_A2_task2(app); // only Task 2 runs; the Task 1 call above is commented out

     renderdoc::endCapture();