try to do kernel decomp

This commit is contained in:
2025-12-29 01:37:19 +01:00
parent 1884daea1f
commit 488b5a7b03
7 changed files with 93 additions and 34 deletions

22
.clang-format Normal file
View File

@@ -0,0 +1,22 @@
BasedOnStyle: LLVM
IndentWidth: 4
AlignConsecutiveAssignments: true
AlignEscapedNewlines: Left
AlignTrailingComments: Always
BreakBeforeBraces: Allman
ColumnLimit: 0
MaxEmptyLinesToKeep: 1
InsertNewlineAtEOF: true
BreakBeforeBinaryOperators: NonAssignment
BinPackArguments: false
PenaltyBreakBeforeFirstCallParameter: 1000
ContinuationIndentWidth: 4 # Adjust the indent width for continuation lines
IncludeCategories:
- Regex: '^(<.+>)$'
Priority: 1
- Regex: '^"(.+\.hpp)"$'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeBlocks: Regroup

View File

@@ -11,14 +11,36 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint offset;
} p;
layout(binding = 0) buffer inBuffer { uint v[]; };
layout(binding = 1) buffer outBuffer { uint g_v[]; };
layout(binding = 0) buffer inBuffer {
uint v[];
};
layout(binding = 1) buffer outBuffer {
uint g_v[];
};
// TODO: Shared variables
// 512 Elements but initial reduction is done
const uint bufferSize = 256;
shared uint[bufferSize] localBuffer;
// Per-workgroup sum reduction (kernel decomposition step).
//
// Each workgroup consumes a contiguous slice of 2 * lanes elements of v,
// reduces it in shared memory and writes a single partial sum to
// g_v[gl_WorkGroupID.x]. The host is expected to re-run the kernel on the
// partial sums (ping-pong) until one value remains.
//
// p.size   - number of valid elements in v; reads beyond it contribute 0.
// p.offset - not used by this kernel.
void main() {
    uint lid   = gl_LocalInvocationID.x;
    uint group = gl_WorkGroupID.x;

    // The workgroup size is a specialization constant; never index past the
    // shared array even if it was specialised larger than bufferSize.
    uint lanes = min(gl_WorkGroupSize.x, bufferSize);

    // First reduction step is fused with the global load: every lane adds
    // two input elements that are `lanes` apart.
    uint first  = group * lanes * 2u + lid;
    uint second = first + lanes;
    if (lid < lanes) {
        uint lo = first < p.size ? v[first] : 0u;
        uint hi = second < p.size ? v[second] : 0u;
        localBuffer[lid] = lo + hi;
    }
    memoryBarrierShared();
    barrier();

    // Tree reduction in shared memory. `width` is the number of live partial
    // sums; rounding the kept part up makes odd widths (non power-of-two
    // workgroup sizes) work as well. The loop bounds are uniform across the
    // workgroup, so every invocation reaches every barrier().
    uint width = lanes;
    while (width > 1u) {
        uint keep = (width + 1u) / 2u;
        if (lid < width / 2u) {
            localBuffer[lid] += localBuffer[lid + keep];
        }
        memoryBarrierShared();
        barrier();
        width = keep;
    }

    // One result per workgroup.
    if (lid == 0u) {
        g_v[group] = localBuffer[0];
    }
}

View File

@@ -19,9 +19,5 @@ layout(binding = 0) buffer inoutBufer { uint v[]; };
// One reduction step: each invocation folds the element p.offset positions
// ahead into its own slot of v. Invocations whose partner index is at or
// beyond p.size do nothing.
void main() {
    uint idx     = gl_GlobalInvocationID.x;
    uint partner = idx + p.offset;
    if (partner < p.size) {
        v[idx] = v[idx] + v[partner];
    }
}

View File

@@ -2,8 +2,7 @@
#include "host_timer.h"
A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) :
app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {}
A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) : app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {}
void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
{
@@ -21,16 +20,14 @@ void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
{{0U, 0U, sizeof(workGroupSize)}},
};
std::array<uint32_t, 1> specValues = {workGroupSize}; // for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
CAST(specValues) * sizeof(int), specValues.data());
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());
Cmn::createShader(app.device, shaderModule, shaderFileName);
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
for (int i = 0; i < 2; i++) {
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]),
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer,
vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
for (int i = 0; i < 2; i++)
{
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
}
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);
@@ -60,6 +57,29 @@ void A2Task1SolutionKernelDecomposition::compute()
// (A2Task1SolutionKernelDecomposition::result())
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
uint blocksize = 512;
uint kernelCount = mpInput->size() / blocksize;
PushConstant p;
for (; kernelCount > 0; kernelCount /= 2)
{
activeBuffer = activeBuffer == 0 ? 1 : 0;
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
p.size = blocksize;
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p);
// for loop to call each kernel
for (int i = 0; i < kernelCount; i++)
{
cb.dispatch(i * blocksize, 0, 0);
}
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
std::cout << "deine mum \n" ;
}
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);

View File

@@ -56,9 +56,8 @@ void A2Task1SolutionSequential::compute()
uint inputSize = mpInput->size();
PushConstant pc;
pc.size = inputSize;
pc.offset = inputSize / 2;
for (; pc.offset != 0 ; pc.offset = pc.offset / 2)
for (pc.offset = inputSize / 2; pc.offset != 0 ; pc.offset = pc.offset / 2)
{
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &pc);

View File

@@ -1,6 +1,7 @@
#include "Naive.h"
#include "host_timer.h"
#include <iostream>
A2Task2SolutioNaive::A2Task2SolutioNaive(
AppResources &app, uint workGroupSize):
@@ -64,7 +65,6 @@ void A2Task2SolutioNaive::compute() {
// That buffer is read back for the correctness check
// (A2Task2SolutionNaive::result())
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);

View File

@@ -39,14 +39,14 @@ void run_A2_task1(AppResources &app){
}
};
A2Task1SolutionInterleaved interleavedSolution(app, 128);
evaluateTask1Solution(&interleavedSolution, "Interleaved");
// A2Task1SolutionInterleaved interleavedSolution(app, 128);
// evaluateTask1Solution(&interleavedSolution, "Interleaved");
A2Task1SolutionSequential sequentialSolution(app, 128);
evaluateTask1Solution(&sequentialSolution, "Sequential");
// A2Task1SolutionSequential sequentialSolution(app, 128);
// evaluateTask1Solution(&sequentialSolution, "Sequential");
// A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
// evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
// A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
// evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
@@ -91,11 +91,11 @@ void run_A2_task2(AppResources& app){
A2Task2SolutioNaive naiveSolution(app, 128);
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
// A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
// evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
// A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
// evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
}
int main()