Attempt kernel decomposition (work in progress)

This commit is contained in:
2025-12-29 01:37:19 +01:00
parent 1884daea1f
commit 488b5a7b03
7 changed files with 93 additions and 34 deletions

22
.clang-format Normal file
View File

@@ -0,0 +1,22 @@
# clang-format configuration: LLVM base style, 4-space indentation, Allman braces.
BasedOnStyle: LLVM
IndentWidth: 4
AlignConsecutiveAssignments: true
AlignEscapedNewlines: Left
AlignTrailingComments: Always
BreakBeforeBraces: Allman
ColumnLimit: 0
MaxEmptyLinesToKeep: 1
InsertNewlineAtEOF: true
BreakBeforeBinaryOperators: NonAssignment
BinPackArguments: false
PenaltyBreakBeforeFirstCallParameter: 1000
ContinuationIndentWidth: 4 # Adjust the indent width for continuation lines
# Include ordering: angle-bracket headers first, then quoted .hpp headers, then everything else.
# Each Priority must be nested under its own list entry; at column 0 it would be read as a
# separate top-level key and the category list would no longer parse.
IncludeCategories:
  - Regex: '^(<.+>)$'
    Priority: 1
  - Regex: '^"(.+\.hpp)"$'
    Priority: 2
  - Regex: '.*'
    Priority: 3
IncludeBlocks: Regroup

View File

@@ -11,14 +11,36 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
// Push constants supplied by the host for each dispatch.
layout(push_constant) uniform PushStruct {
    uint size;   // treated by this kernel as the number of valid elements in v[]
    uint offset; // not read by this kernel; presumably kept to match the host-side struct layout - confirm
} p;

// Input values to be summed.
layout(binding = 0) buffer inBuffer {
    uint v[];
};
// Output: one partial sum per work group, written to g_v[gl_WorkGroupID.x].
layout(binding = 1) buffer outBuffer {
    uint g_v[];
};

// Scratch space for the in-group tree reduction, one slot per invocation.
// NOTE(review): this requires gl_WorkGroupSize.x <= bufferSize and a power-of-two
// work group size; the host must guarantee both.
const uint bufferSize = 256;
shared uint[bufferSize] localBuffer;

// Reduces 2 * gl_WorkGroupSize.x consecutive elements of v[] per work group.
//
// The previous loops could never work: "i < 0" is always false for a uint, "j / 2"
// does not modify j (so "j != 0" never terminates), and every invocation repeated the
// same serial work. The reduction is now distributed over the invocations of the group.
//
// NOTE(review): the host is expected to dispatch ceil(p.size / (2 * gl_WorkGroupSize.x))
// work groups and to run further passes over g_v[] until a single value remains.
void main() {
    uint localId   = gl_LocalInvocationID.x;
    uint groupSize = gl_WorkGroupSize.x;

    // The first addition is done while loading, so each invocation covers two inputs.
    uint first  = gl_WorkGroupID.x * groupSize * 2u + localId;
    uint second = first + groupSize;

    // Out-of-range inputs contribute 0 so a partially filled last group stays correct.
    uint sum = 0u;
    if (first < p.size) {
        sum += v[first];
    }
    if (second < p.size) {
        sum += v[second];
    }
    localBuffer[localId] = sum;
    barrier();

    // Tree reduction in shared memory: the active range halves every step.
    for (uint stride = groupSize / 2u; stride > 0u; stride /= 2u) {
        if (localId < stride) {
            localBuffer[localId] += localBuffer[localId + stride];
        }
        // All invocations must reach the barrier, so it sits outside the branch.
        barrier();
    }

    // A single invocation publishes the group's partial sum.
    if (localId == 0u) {
        g_v[gl_WorkGroupID.x] = localBuffer[0];
    }
}

View File

@@ -19,9 +19,5 @@ layout(binding = 0) buffer inoutBufer { uint v[]; };
// One step of the in-place reduction: element gIDx accumulates the element p.offset
// slots above it.
void main() {
    uint gIDx = gl_GlobalInvocationID.x;
    // Only invocations in [0, p.offset) may write. An invocation at or above p.offset
    // would modify an element that a lower invocation reads in this same dispatch,
    // which is a data race on the shared in/out buffer.
    // The second test keeps the read of v[gIDx + p.offset] below p.size.
    if (gIDx >= p.offset || gIDx + p.offset >= p.size) {
        return;
    }
    v[gIDx] += v[gIDx + p.offset];
}

View File

@@ -2,8 +2,7 @@
#include "host_timer.h" #include "host_timer.h"
/// Stores the application resources, the work group size and the path of the
/// compiled shader in the corresponding members; no other work is done here.
A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(
    AppResources &app,
    uint workGroupSize,
    std::string shaderFileName)
    : app(app),
      workGroupSize(workGroupSize),
      shaderFileName(shaderFileName)
{
}
void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input) void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
{ {
@@ -21,16 +20,14 @@ void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
{{0U, 0U, sizeof(workGroupSize)}}, {{0U, 0U, sizeof(workGroupSize)}},
}; };
std::array<uint32_t, 1> specValues = {workGroupSize}; // for workgroup sizes std::array<uint32_t, 1> specValues = {workGroupSize}; // for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());
CAST(specValues) * sizeof(int), specValues.data());
Cmn::createShader(app.device, shaderModule, shaderFileName); Cmn::createShader(app.device, shaderModule, shaderFileName);
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule); Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
for (int i = 0; i < 2; i++) { for (int i = 0; i < 2; i++)
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), {
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
} }
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input); fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);
@@ -60,6 +57,29 @@ void A2Task1SolutionKernelDecomposition::compute()
// (A2Task1SolutionKernelDecomposition::result()) // (A2Task1SolutionKernelDecomposition::result())
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong // HINT: You can alternate between the two provided descriptor sets to implement ping-pong
uint blocksize = 512;
uint kernelCount = mpInput->size() / blocksize;
PushConstant p;
for (; kernelCount > 0; kernelCount /= 2)
{
activeBuffer = activeBuffer == 0 ? 1 : 0;
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
p.size = blocksize;
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p);
// for loop to call each kernel
for (int i = 0; i < kernelCount; i++)
{
cb.dispatch(i * blocksize, 0, 0);
}
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
std::cout << "deine mum \n" ;
}
cb.end(); cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);

View File

@@ -56,9 +56,8 @@ void A2Task1SolutionSequential::compute()
uint inputSize = mpInput->size(); uint inputSize = mpInput->size();
PushConstant pc; PushConstant pc;
pc.size = inputSize; pc.size = inputSize;
pc.offset = inputSize / 2;
for (; pc.offset != 0 ; pc.offset = pc.offset / 2) for (pc.offset = inputSize / 2; pc.offset != 0 ; pc.offset = pc.offset / 2)
{ {
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &pc); cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &pc);

View File

@@ -1,6 +1,7 @@
#include "Naive.h" #include "Naive.h"
#include "host_timer.h" #include "host_timer.h"
#include <iostream>
A2Task2SolutioNaive::A2Task2SolutioNaive( A2Task2SolutioNaive::A2Task2SolutioNaive(
AppResources &app, uint workGroupSize): AppResources &app, uint workGroupSize):
@@ -64,7 +65,6 @@ void A2Task2SolutioNaive::compute() {
// That buffer is read back for the correctness check // That buffer is read back for the correctness check
// (A2Task2SolutionNaive::result()) // (A2Task2SolutionNaive::result())
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong // HINT: You can alternate between the two provided descriptor sets to implement ping-pong
cb.end(); cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);

View File

@@ -39,14 +39,14 @@ void run_A2_task1(AppResources &app){
} }
}; };
A2Task1SolutionInterleaved interleavedSolution(app, 128); // A2Task1SolutionInterleaved interleavedSolution(app, 128);
evaluateTask1Solution(&interleavedSolution, "Interleaved"); // evaluateTask1Solution(&interleavedSolution, "Interleaved");
A2Task1SolutionSequential sequentialSolution(app, 128); // A2Task1SolutionSequential sequentialSolution(app, 128);
evaluateTask1Solution(&sequentialSolution, "Sequential"); // evaluateTask1Solution(&sequentialSolution, "Sequential");
// A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv"); A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
// evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition"); evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
// A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv"); // A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
// evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll"); // evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
@@ -91,11 +91,11 @@ void run_A2_task2(AppResources& app){
A2Task2SolutioNaive naiveSolution(app, 128); A2Task2SolutioNaive naiveSolution(app, 128);
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5); evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal); // A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5); // evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128); // A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5); // evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
} }
int main() int main()