try to do kernel decomp
This commit is contained in:
22
.clang-format
Normal file
22
.clang-format
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
BasedOnStyle: LLVM
|
||||||
|
IndentWidth: 4
|
||||||
|
AlignConsecutiveAssignments: true
|
||||||
|
AlignEscapedNewlines: Left
|
||||||
|
AlignTrailingComments: Always
|
||||||
|
BreakBeforeBraces: Allman
|
||||||
|
ColumnLimit: 0
|
||||||
|
MaxEmptyLinesToKeep: 1
|
||||||
|
InsertNewlineAtEOF: true
|
||||||
|
BreakBeforeBinaryOperators: NonAssignment
|
||||||
|
BinPackArguments: false
|
||||||
|
PenaltyBreakBeforeFirstCallParameter: 1000
|
||||||
|
ContinuationIndentWidth: 4 # Adjust the indent width for continuation lines
|
||||||
|
|
||||||
|
IncludeCategories:
|
||||||
|
- Regex: '^(<.+>)$'
|
||||||
|
Priority: 1
|
||||||
|
- Regex: '^"(.+\.hpp)"$'
|
||||||
|
Priority: 2
|
||||||
|
- Regex: '.*'
|
||||||
|
Priority: 3
|
||||||
|
IncludeBlocks: Regroup
|
||||||
@@ -11,14 +11,36 @@ layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
|||||||
|
|
||||||
layout(push_constant) uniform PushStruct {
|
layout(push_constant) uniform PushStruct {
|
||||||
uint size;
|
uint size;
|
||||||
uint offset;
|
|
||||||
} p;
|
} p;
|
||||||
|
|
||||||
layout(binding = 0) buffer inBuffer { uint v[]; };
|
layout(binding = 0) buffer inBuffer {
|
||||||
layout(binding = 1) buffer outBuffer { uint g_v[]; };
|
uint v[];
|
||||||
|
};
|
||||||
|
layout(binding = 1) buffer outBuffer {
|
||||||
|
uint g_v[];
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: Shared variables
|
// TODO: Shared variables
|
||||||
|
// Each block covers 512 input elements; the initial pairwise add leaves 256 partial sums, hence bufferSize = 256
|
||||||
|
const uint bufferSize = 256;
|
||||||
|
shared uint[bufferSize] localBuffer;
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
// TODO: Kernel implementation
|
// TODO: Kernel implementation
|
||||||
|
|
||||||
|
for (uint i = p.size / 2; i < 0; i -= 2) {
|
||||||
|
localBuffer[i] = v[i] + v[i + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (uint j = bufferSize ; j != 0; j / 2) {
|
||||||
|
for (uint i = bufferSize / 2; i < 0; i -= 2) {
|
||||||
|
localBuffer[i] = localBuffer[i] + localBuffer[i + 1];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
localBuffer[0] = localBuffer[0] + localBuffer[1];
|
||||||
|
|
||||||
|
for (uint i = 0; i < bufferSize; i ++) {
|
||||||
|
g_v[i] = localBuffer[i];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -19,9 +19,5 @@ layout(binding = 0) buffer inoutBufer { uint v[]; };
|
|||||||
void main() {
|
void main() {
|
||||||
uint gIDx = gl_GlobalInvocationID.x;
|
uint gIDx = gl_GlobalInvocationID.x;
|
||||||
|
|
||||||
if (gIDx + p.offset >= p.size) {
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
v[gIDx] += v[gIDx + p.offset];
|
v[gIDx] += v[gIDx + p.offset];
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,8 +2,7 @@
|
|||||||
|
|
||||||
#include "host_timer.h"
|
#include "host_timer.h"
|
||||||
|
|
||||||
A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) :
|
A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) : app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {}
|
||||||
app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {}
|
|
||||||
|
|
||||||
void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
|
void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
|
||||||
{
|
{
|
||||||
@@ -17,24 +16,22 @@ void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
|
|||||||
pipelineLayout = app.device.createPipelineLayout(pipInfo);
|
pipelineLayout = app.device.createPipelineLayout(pipInfo);
|
||||||
|
|
||||||
// Specialization constant for workgroup size
|
// Specialization constant for workgroup size
|
||||||
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
|
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
|
||||||
{{0U, 0U, sizeof(workGroupSize)}},
|
{{0U, 0U, sizeof(workGroupSize)}},
|
||||||
};
|
};
|
||||||
std::array<uint32_t, 1> specValues = {workGroupSize}; //for workgroup sizes
|
std::array<uint32_t, 1> specValues = {workGroupSize}; // for workgroup sizes
|
||||||
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
|
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());
|
||||||
CAST(specValues) * sizeof(int), specValues.data());
|
|
||||||
|
|
||||||
Cmn::createShader(app.device, shaderModule, shaderFileName);
|
Cmn::createShader(app.device, shaderModule, shaderFileName);
|
||||||
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
|
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
|
||||||
|
|
||||||
for (int i = 0; i < 2; i++) {
|
for (int i = 0; i < 2; i++)
|
||||||
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]),
|
{
|
||||||
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer,
|
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
|
||||||
vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);
|
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);
|
||||||
|
|
||||||
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2);
|
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2);
|
||||||
for (int i = 0; i < 2; i++)
|
for (int i = 0; i < 2; i++)
|
||||||
Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout);
|
Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout);
|
||||||
@@ -48,7 +45,7 @@ void A2Task1SolutionKernelDecomposition::compute()
|
|||||||
{
|
{
|
||||||
vk::CommandBufferAllocateInfo allocInfo(
|
vk::CommandBufferAllocateInfo allocInfo(
|
||||||
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
|
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
|
||||||
vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
|
vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];
|
||||||
|
|
||||||
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
|
|
||||||
@@ -60,6 +57,29 @@ void A2Task1SolutionKernelDecomposition::compute()
|
|||||||
// (A2Task1SolutionKernelDecomposition::result())
|
// (A2Task1SolutionKernelDecomposition::result())
|
||||||
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
|
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
|
||||||
|
|
||||||
|
uint blocksize = 512;
|
||||||
|
uint kernelCount = mpInput->size() / blocksize;
|
||||||
|
PushConstant p;
|
||||||
|
|
||||||
|
for (; kernelCount > 0; kernelCount /= 2)
|
||||||
|
{
|
||||||
|
activeBuffer = activeBuffer == 0 ? 1 : 0;
|
||||||
|
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
|
||||||
|
p.size = blocksize;
|
||||||
|
|
||||||
|
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p);
|
||||||
|
|
||||||
|
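// Record one dispatch per block. NOTE(review): dispatch() takes workgroup counts (x, y, z), not an element offset, and a count of 0 in any dimension launches no work - confirm the arguments below.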
|
||||||
|
for (int i = 0; i < kernelCount; i++)
|
||||||
|
{
|
||||||
|
cb.dispatch(i * blocksize, 0, 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
|
||||||
|
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
|
||||||
|
|
||||||
|
std::cout << "deine mum \n" ;
|
||||||
|
}
|
||||||
cb.end();
|
cb.end();
|
||||||
|
|
||||||
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
||||||
@@ -94,4 +114,4 @@ void A2Task1SolutionKernelDecomposition::cleanup()
|
|||||||
|
|
||||||
for (int i = 0; i < 2; i++)
|
for (int i = 0; i < 2; i++)
|
||||||
destroyBuffer(app.device, buffers[i]);
|
destroyBuffer(app.device, buffers[i]);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -56,9 +56,8 @@ void A2Task1SolutionSequential::compute()
|
|||||||
uint inputSize = mpInput->size();
|
uint inputSize = mpInput->size();
|
||||||
PushConstant pc;
|
PushConstant pc;
|
||||||
pc.size = inputSize;
|
pc.size = inputSize;
|
||||||
pc.offset = inputSize / 2;
|
|
||||||
|
|
||||||
for (; pc.offset != 0 ; pc.offset = pc.offset / 2)
|
for (pc.offset = inputSize / 2; pc.offset != 0 ; pc.offset = pc.offset / 2)
|
||||||
{
|
{
|
||||||
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &pc);
|
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &pc);
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
#include "Naive.h"
|
#include "Naive.h"
|
||||||
|
|
||||||
#include "host_timer.h"
|
#include "host_timer.h"
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
A2Task2SolutioNaive::A2Task2SolutioNaive(
|
A2Task2SolutioNaive::A2Task2SolutioNaive(
|
||||||
AppResources &app, uint workGroupSize):
|
AppResources &app, uint workGroupSize):
|
||||||
@@ -64,7 +65,6 @@ void A2Task2SolutioNaive::compute() {
|
|||||||
// That buffer is read back for the correctness check
|
// That buffer is read back for the correctness check
|
||||||
// (A2Task2SolutionNaive::result())
|
// (A2Task2SolutionNaive::result())
|
||||||
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
|
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
|
||||||
|
|
||||||
cb.end();
|
cb.end();
|
||||||
|
|
||||||
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
||||||
|
|||||||
20
src/main.cpp
20
src/main.cpp
@@ -39,14 +39,14 @@ void run_A2_task1(AppResources &app){
|
|||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
A2Task1SolutionInterleaved interleavedSolution(app, 128);
|
// A2Task1SolutionInterleaved interleavedSolution(app, 128);
|
||||||
evaluateTask1Solution(&interleavedSolution, "Interleaved");
|
// evaluateTask1Solution(&interleavedSolution, "Interleaved");
|
||||||
|
|
||||||
A2Task1SolutionSequential sequentialSolution(app, 128);
|
// A2Task1SolutionSequential sequentialSolution(app, 128);
|
||||||
evaluateTask1Solution(&sequentialSolution, "Sequential");
|
// evaluateTask1Solution(&sequentialSolution, "Sequential");
|
||||||
|
|
||||||
// A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
|
A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
|
||||||
// evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
|
evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
|
||||||
|
|
||||||
// A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
|
// A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
|
||||||
// evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
|
// evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
|
||||||
@@ -91,11 +91,11 @@ void run_A2_task2(AppResources& app){
|
|||||||
A2Task2SolutioNaive naiveSolution(app, 128);
|
A2Task2SolutioNaive naiveSolution(app, 128);
|
||||||
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
|
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
|
||||||
|
|
||||||
A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
|
// A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
|
||||||
evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
|
// evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
|
||||||
|
|
||||||
A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
|
// A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
|
||||||
evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
|
// evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
|
||||||
|
|
||||||
}
|
}
|
||||||
int main()
|
int main()
|
||||||
|
|||||||
Reference in New Issue
Block a user