From 488b5a7b03db0faf1c0049b8f8ff4161e382d787 Mon Sep 17 00:00:00 2001 From: Arif Hasanic Date: Mon, 29 Dec 2025 01:37:19 +0100 Subject: [PATCH] try to do kernel decomp --- .clang-format | 22 ++++++++++ shaders/A2Task1KernelDecomposition.comp | 28 ++++++++++-- shaders/A2Task1Sequential.comp | 4 -- src/A2Task1Solution/KernelDecomposition.cpp | 48 +++++++++++++++------ src/A2Task1Solution/Sequential.cpp | 3 +- src/A2Task2Solution/Naive.cpp | 2 +- src/main.cpp | 20 ++++----- 7 files changed, 93 insertions(+), 34 deletions(-) create mode 100644 .clang-format diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..5ba963f --- /dev/null +++ b/.clang-format @@ -0,0 +1,22 @@ +BasedOnStyle: LLVM +IndentWidth: 4 +AlignConsecutiveAssignments: true +AlignEscapedNewlines: Left +AlignTrailingComments: Always +BreakBeforeBraces: Allman +ColumnLimit: 0 +MaxEmptyLinesToKeep: 1 +InsertNewlineAtEOF: true +BreakBeforeBinaryOperators: NonAssignment +BinPackArguments: false +PenaltyBreakBeforeFirstCallParameter: 1000 +ContinuationIndentWidth: 4 # Adjust the indent width for continuation lines + +IncludeCategories: + - Regex: '^(<.+>)$' + Priority: 1 + - Regex: '^"(.+\.hpp)"$' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeBlocks: Regroup \ No newline at end of file diff --git a/shaders/A2Task1KernelDecomposition.comp b/shaders/A2Task1KernelDecomposition.comp index 8967b69..f9b0d48 100644 --- a/shaders/A2Task1KernelDecomposition.comp +++ b/shaders/A2Task1KernelDecomposition.comp @@ -11,14 +11,36 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; layout(push_constant) uniform PushStruct { uint size; - uint offset; } p; -layout(binding = 0) buffer inBuffer { uint v[]; }; -layout(binding = 1) buffer outBuffer { uint g_v[]; }; +layout(binding = 0) buffer inBuffer { + uint v[]; +}; +layout(binding = 1) buffer outBuffer { + uint g_v[]; +}; // TODO: Shared variables +// 512 Elements but initial reduction is done +const uint bufferSize = 256; +shared uint[bufferSize] localBuffer; void main() { // TODO: Kernel implementation + + for (uint i = p.size / 2; i < 0; i -= 2) { + localBuffer[i] = v[i] + v[i + 1]; + } + + for (uint j = bufferSize ; j != 0; j / 2) { + for (uint i = bufferSize / 2; i < 0; i -= 2) { + localBuffer[i] = localBuffer[i] + localBuffer[i + 1]; + } + } + + localBuffer[0] = localBuffer[0] + localBuffer[1]; + + for (uint i = 0; i < bufferSize; i ++) { + g_v[i] = localBuffer[i]; + } } \ No newline at end of file diff --git a/shaders/A2Task1Sequential.comp b/shaders/A2Task1Sequential.comp index afcb285..29bf9fa 100644 --- a/shaders/A2Task1Sequential.comp +++ b/shaders/A2Task1Sequential.comp @@ -19,9 +19,5 @@ layout(binding = 0) buffer inoutBufer { uint v[]; }; void main() { uint gIDx = gl_GlobalInvocationID.x; - if (gIDx + p.offset >= p.size) { - return; - } - v[gIDx] += v[gIDx + p.offset]; } diff --git a/src/A2Task1Solution/KernelDecomposition.cpp b/src/A2Task1Solution/KernelDecomposition.cpp index 7e4265b..b9008c7 100644 --- a/src/A2Task1Solution/KernelDecomposition.cpp +++ b/src/A2Task1Solution/KernelDecomposition.cpp @@ -2,8 +2,7 @@ #include "host_timer.h" -A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) : - app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {} +A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) : app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {} void A2Task1SolutionKernelDecomposition::prepare(const std::vector &input) { @@ -17,24 +16,22 @@ void A2Task1SolutionKernelDecomposition::prepare(const std::vector &input) pipelineLayout = app.device.createPipelineLayout(pipInfo); // Specialization constant for workgroup size - std::array specEntries = std::array{ + std::array specEntries = std::array{ {{0U, 0U, sizeof(workGroupSize)}}, - }; - std::array specValues = {workGroupSize}; //for workgroup sizes - vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), - CAST(specValues) * sizeof(int), specValues.data()); + }; + std::array specValues = {workGroupSize}; // for workgroup sizes + vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data()); Cmn::createShader(app.device, shaderModule, shaderFileName); Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule); - for (int i = 0; i < 2; i++) { - createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), - vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, - vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem); + for (int i = 0; i < 2; i++) + { + createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem); } fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input); - + Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2); for (int i = 0; i < 2; i++) Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout); @@ -48,7 +45,7 @@ void A2Task1SolutionKernelDecomposition::compute() { vk::CommandBufferAllocateInfo allocInfo( app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U); - vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0]; + vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0]; vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); @@ -60,6 +57,29 @@ void A2Task1SolutionKernelDecomposition::compute() // (A2Task1SolutionKernelDecomposition::result()) // HINT: You can alternate between the two provided descriptor sets to implement ping-pong + uint blocksize = 512; + uint kernelCount = mpInput->size() / blocksize; + PushConstant p; + + for (; kernelCount > 0; kernelCount /= 2) + { + activeBuffer = activeBuffer == 0 ? 1 : 0; + cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr); + p.size = blocksize; + + cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p); + + // for loop to call each kernel + for (int i = 0; i < kernelCount; i++) + { + cb.dispatch(i * blocksize, 0, 0); + } + + vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead); + cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr); + + std::cout << "deine mum \n" ; + } cb.end(); vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); @@ -94,4 +114,4 @@ void A2Task1SolutionKernelDecomposition::cleanup() for (int i = 0; i < 2; i++) destroyBuffer(app.device, buffers[i]); -} \ No newline at end of file +} diff --git a/src/A2Task1Solution/Sequential.cpp b/src/A2Task1Solution/Sequential.cpp index f204433..287ed2e 100644 --- a/src/A2Task1Solution/Sequential.cpp +++ b/src/A2Task1Solution/Sequential.cpp @@ -56,9 +56,8 @@ void A2Task1SolutionSequential::compute() uint inputSize = mpInput->size(); PushConstant pc; pc.size = inputSize; - pc.offset = inputSize / 2; - for (; pc.offset != 0 ; pc.offset = pc.offset / 2) + for (pc.offset = inputSize / 2; pc.offset != 0 ; pc.offset = pc.offset / 2) { cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &pc); diff --git a/src/A2Task2Solution/Naive.cpp b/src/A2Task2Solution/Naive.cpp index 96e8243..671320f 100644 --- a/src/A2Task2Solution/Naive.cpp +++ b/src/A2Task2Solution/Naive.cpp @@ -1,6 +1,7 @@ #include "Naive.h" #include "host_timer.h" +#include A2Task2SolutioNaive::A2Task2SolutioNaive( AppResources &app, uint workGroupSize): @@ -64,7 +65,6 @@ void A2Task2SolutioNaive::compute() { // That buffer is read back for the correctness check // (A2Task2SolutionNaive::result()) // HINT: You can alternate between the two provided descriptor sets to implement ping-pong - cb.end(); vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); diff --git a/src/main.cpp b/src/main.cpp index e1286f7..3cc64a4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -39,14 +39,14 @@ void run_A2_task1(AppResources &app){ } }; - A2Task1SolutionInterleaved interleavedSolution(app, 128); - evaluateTask1Solution(&interleavedSolution, "Interleaved"); + // A2Task1SolutionInterleaved interleavedSolution(app, 128); + // evaluateTask1Solution(&interleavedSolution, "Interleaved"); - A2Task1SolutionSequential sequentialSolution(app, 128); - evaluateTask1Solution(&sequentialSolution, "Sequential"); + // A2Task1SolutionSequential sequentialSolution(app, 128); + // evaluateTask1Solution(&sequentialSolution, "Sequential"); - // A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv"); - // evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition"); + A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv"); + evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition"); // A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv"); // evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll"); @@ -91,11 +91,11 @@ void run_A2_task2(AppResources& app){ A2Task2SolutioNaive naiveSolution(app, 128); evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5); - A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal); - evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5); + // A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal); + // evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5); - A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128); - evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5); + // A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128); + // evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5); } int main()