init commit

2025-12-21 15:41:59 +01:00
commit c86af91a92
36 changed files with 2399 additions and 0 deletions

45
.gitignore vendored Normal file
View File

@@ -0,0 +1,45 @@
# Compiled Object files
**/.DS_Store
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
*.smod
# Compiled Static libraries
*.lai
*.la
*.lib
# Executables
*.exe
*.out
*.app
**/cmake-build-debug
**/CMakeCache.txt
**/cmake_install.cmake
**/install_manifest.txt
**/CMakeFiles/
**/CTestTestfile.cmake
**/*.cbp
**/CMakeScripts
**/compile_commands.json
include/divisible/*
build/
.cache/
.vscode/

77
CMakeLists.txt Normal file
View File

@@ -0,0 +1,77 @@
cmake_minimum_required(VERSION 3.16)
set(CMAKE_CXX_STANDARD 17)
project(Assignment2)
function(add_shader TARGET SHADER)
find_program(GLSLC glslc)
set(current-shader-path ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER})
set(current-output-path ${CMAKE_CURRENT_SOURCE_DIR}/build/${SHADER}.spv)
file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/build)
get_filename_component(current-output-dir ${current-output-path} DIRECTORY)
file(MAKE_DIRECTORY ${current-output-dir})
add_custom_command(
OUTPUT ${current-output-path}
COMMAND ${GLSLC} --target-env=vulkan1.2 -o ${current-output-path} ${current-shader-path}
DEPENDS ${current-shader-path}
IMPLICIT_DEPENDS CXX ${current-shader-path}
VERBATIM)
# Make sure our build depends on this output.
set_source_files_properties(${current-output-path} PROPERTIES GENERATED TRUE)
target_sources(${TARGET} PRIVATE ${current-output-path})
endfunction(add_shader)
find_package(Vulkan REQUIRED)
if (DEFINED ENV{RENDERDOC_PATH})
set(RENDERDOC_PATH $ENV{RENDERDOC_PATH})
elseif (WIN32)
if(EXISTS "C:\\Program Files\\RenderDoc")
set(RENDERDOC_PATH "C:\\Program Files\\RenderDoc")
endif()
else ()
#LINUX PATH HERE
endif ()
set(SOURCE_FILE
src/main.cpp
src/task_common.cpp
src/host_timer.cpp
src/initialization.cpp
src/renderdoc.cpp
src/utils.cpp
src/A2Task1.cpp
src/A2Task2.cpp
src/A2Task1Solution/Interleaved.cpp
src/A2Task1Solution/KernelDecomposition.cpp
src/A2Task1Solution/Sequential.cpp
src/A2Task2Solution/KernelDecomposition.cpp
src/A2Task2Solution/Naive.cpp
)
add_compile_definitions(WORKING_DIR="${CMAKE_CURRENT_SOURCE_DIR}")
add_executable(${PROJECT_NAME} ${SOURCE_FILE})
add_shader(${PROJECT_NAME} shaders/A2Task1Interleaved.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1KernelDecomposition.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1KernelDecompositionAtomic.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1KernelDecompositionUnroll.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1Sequential.comp)
add_shader(${PROJECT_NAME} shaders/A2Task2KernelDecomposition.comp)
add_shader(${PROJECT_NAME} shaders/A2Task2KernelDecompositionOffset.comp)
add_shader(${PROJECT_NAME} shaders/A2Task2Naive.comp)
target_include_directories(${PROJECT_NAME} PUBLIC ./include)
target_include_directories(${PROJECT_NAME} PRIVATE ${Vulkan_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} Vulkan::Vulkan)
target_compile_definitions(${PROJECT_NAME} PRIVATE)
if (RENDERDOC_PATH)
target_include_directories(${PROJECT_NAME} PRIVATE /usr/local/renderdoc_1.33/include)
target_include_directories(${PROJECT_NAME} PRIVATE ${RENDERDOC_PATH})
target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_RENDERDOC)
endif ()
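
Note: glslc (shipped with the Vulkan SDK) must be on the PATH for find_program to locate it. Also note that add_shader writes the compiled SPIR-V to <source dir>/build/shaders/<name>.comp.spv rather than into the CMake binary directory; this matches the workingDir + "build/shaders/..." paths the host code uses when loading shader modules, so the lookup works no matter where the actual build tree lives.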

37
include/A2Task1.h Normal file
View File

@@ -0,0 +1,37 @@
#pragma once
#include "helper.h"
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#include "task_common.h"
class A2Task1Solution {
public:
float mstime;
virtual void prepare(const std::vector<uint> &input) = 0;
virtual void compute() = 0;
virtual uint result() const = 0;
virtual void cleanup() = 0;
};
class A2Task1 {
public:
A2Task1(uint problemSize);
A2Task1(std::vector<uint> input);
bool evaluateSolution(A2Task1Solution& solution);
private:
void computeReference();
std::vector<uint> input;
uint reference;
};
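
For orientation, a minimal sketch of how a task is typically driven through this interface; the problem size and workgroup size are arbitrary placeholders, and an already-initialized AppResources (see initialization.h) plus the header of a concrete solution are assumed:

// assumes: AppResources app has been set up via initApp(app),
// and a concrete solution such as A2Task1SolutionSequential is included
A2Task1 task(1u << 20);                       // hypothetical problem size
A2Task1SolutionSequential solution(app, 256); // hypothetical workgroup size
bool ok = task.evaluateSolution(solution);    // calls prepare(), compute(), checks result()
solution.cleanup();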

40
include/A2Task2.h Normal file
View File

@@ -0,0 +1,40 @@
#pragma once
#include "helper.h"
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#include "task_common.h"
class A2Task2Solution {
public:
float mstime;
virtual void prepare(const std::vector<uint> &input) = 0;
virtual void compute() = 0;
virtual std::vector<uint> result() const = 0;
virtual void cleanup() = 0;
};
class A2Task2 {
public:
A2Task2(uint problemSize);
A2Task2(std::vector<uint> input);
bool evaluateSolution(A2Task2Solution& solution);
size_t size() const {
return input.size();
}
private:
void computeReference();
std::vector<uint> input;
std::vector<uint> reference;
};

8
include/helper.h Normal file
View File

@@ -0,0 +1,8 @@
#pragma once
#include <string>
#if defined(WORKING_DIR)
inline std::string workingDir = std::string(WORKING_DIR) + "/";
#else
inline std::string workingDir = std::string("./");
#endif

15
include/host_timer.h Normal file
View File

@@ -0,0 +1,15 @@
#pragma once
#include <chrono>
class HostTimer {
private:
using clock = std::chrono::high_resolution_clock;
clock::time_point start;
public:
HostTimer();
void reset();
double elapsed() const;
};

47
include/initialization.h Normal file
View File

@@ -0,0 +1,47 @@
#ifndef INITIALIZATION
#define INITIALIZATION
#include <vulkan/vulkan.hpp>
#include <cstring>
struct AppResources
{
vk::Instance instance;
vk::DebugUtilsMessengerEXT dbgUtilsMgr;
vk::PhysicalDevice pDevice;
vk::PhysicalDeviceProperties2 pDeviceProperties;
vk::PhysicalDeviceSubgroupProperties pDeviceSubgroupProperties;
vk::Device device;
vk::Queue computeQueue, transferQueue;
uint32_t cQ, tQ;
vk::CommandPool computeCommandPool, transferCommandPool;
vk::QueryPool queryPool;
void destroy();
};
VKAPI_ATTR VkBool32 VKAPI_CALL
debugUtilsMessengerCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
VkDebugUtilsMessageTypeFlagsEXT messageTypes,
VkDebugUtilsMessengerCallbackDataEXT const *pCallbackData,
void * /*pUserData*/);
vk::DebugUtilsMessengerCreateInfoEXT makeDebugUtilsMessengerCreateInfoEXT();
void selectPhysicalDevice(vk::Instance &instance, vk::PhysicalDevice &pDevice);
void createInstance(vk::Instance &instance, vk::DebugUtilsMessengerEXT &debugUtilsMessenger,
std::string appName, std::string engineName);
void createLogicalDevice(vk::Instance &instance, vk::PhysicalDevice &pDevice, vk::Device &device);
std::tuple<uint32_t, uint32_t> getComputeAndTransferQueues(vk::PhysicalDevice &pDevice);
void createCommandPool(vk::Device &device, vk::CommandPool &commandPool, uint32_t queueIndex);
void destroyInstance(vk::Instance &instance, vk::DebugUtilsMessengerEXT &debugUtilsMessenger);
void destroyLogicalDevice(vk::Device &device);
void destroyCommandPool(vk::Device &device, vk::CommandPool &commandPool);
void createTimestampQueryPool(vk::Device &device, vk::QueryPool &queryPool, uint32_t queryCount);
void destroyQueryPool(vk::Device &device, vk::QueryPool &queryPool);
void printDeviceCapabilities(vk::PhysicalDevice &pDevice);
void initApp(AppResources &app);
#endif
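
A minimal sketch of the lifecycle these helpers imply, with initApp doing the setup and AppResources::destroy tearing it down:

AppResources app;
initApp(app);   // instance, debug messenger, device, queues, command pool, query pool
// ... create buffers, dispatch work, evaluate solutions ...
app.destroy();  // releases everything initApp created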

7
include/renderdoc.h Normal file
View File

@@ -0,0 +1,7 @@
#pragma once
namespace renderdoc {
void initialize();
void startCapture();
void endCapture();
}

47
include/task_common.h Normal file
View File

@@ -0,0 +1,47 @@
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#ifndef EX_TEMPLATE
#define EX_TEMPLATE
namespace Cmn {
void createDescriptorSetLayout(vk::Device &device,
std::vector<vk::DescriptorSetLayoutBinding> &bindings, vk::DescriptorSetLayout &descLayout);
void addStorage(std::vector<vk::DescriptorSetLayoutBinding> &bindings, uint32_t binding);
void allocateDescriptorSet(vk::Device &device, vk::DescriptorSet &descSet, vk::DescriptorPool &descPool,
vk::DescriptorSetLayout &descLayout);
void bindBuffers(vk::Device &device, vk::Buffer &b, vk::DescriptorSet &set, uint32_t binding);
void createDescriptorPool(vk::Device &device,
std::vector<vk::DescriptorSetLayoutBinding> &bindings, vk::DescriptorPool &descPool, uint32_t numDescriptors = 1);
void createPipeline(vk::Device &device, vk::Pipeline &pipeline,
vk::PipelineLayout &pipLayout, vk::SpecializationInfo &specInfo, vk::ShaderModule &sModule);
void createShader(vk::Device &device, vk::ShaderModule &shaderModule, const std::string &filename);
}
struct TaskResources
{
//std::vector<Buffer> buffers; move this to user code
vk::ShaderModule cShader;
vk::DescriptorSetLayout descriptorSetLayout;
std::vector<vk::DescriptorSetLayoutBinding> bindings;
vk::DescriptorSet descriptorSet;
vk::DescriptorPool descriptorPool;
vk::Pipeline pipeline;
vk::PipelineLayout pipelineLayout;
void destroy(vk::Device &device);
};
#endif

113
include/utils.h Normal file
View File

@@ -0,0 +1,113 @@
#ifndef UTILS
#define UTILS
#include <vector>
#include <cstring>
#include <vulkan/vulkan.hpp>
#define CAST(a) static_cast<uint32_t>(a.size())
struct Buffer
{
vk::Buffer buf;
vk::DeviceMemory mem;
};
typedef uint32_t uint;
template<typename T, typename V>
T ceilDiv(T x, V y) {
return x / y + (x % y != 0);
}
std::vector<char> readFile(const std::string &filename);
std::string formatSize(uint64_t size);
uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties, vk::PhysicalDevice &pdevice);
void createBuffer(vk::PhysicalDevice &pDevice, vk::Device &device,
const vk::DeviceSize &size, vk::BufferUsageFlags usage,
vk::MemoryPropertyFlags properties, std::string name, vk::Buffer &buffer, vk::DeviceMemory &bufferMemory);
void createBuffer(vk::PhysicalDevice &pDevice, vk::Device &device,
const vk::DeviceSize &size, vk::BufferUsageFlags usage,
vk::MemoryPropertyFlags properties, std::string name, Buffer &buffer);
void destroyBuffer(vk::Device &device, Buffer &buffer);
void copyBuffer(vk::Device &device, vk::Queue &q, vk::CommandPool &commandPool,
const vk::Buffer &srcBuffer, vk::Buffer &dstBuffer, vk::DeviceSize byteSize);
vk::CommandBuffer beginSingleTimeCommands(vk::Device &device, vk::CommandPool &commandPool);
void endSingleTimeCommands(vk::Device &device, vk::Queue &q,
vk::CommandPool &commandPool, vk::CommandBuffer &commandBuffer);
Buffer addHostCoherentBuffer(vk::PhysicalDevice &pDevice, vk::Device &device, vk::DeviceSize size, std::string name);
Buffer addDeviceOnlyBuffer(vk::PhysicalDevice &pDevice, vk::Device &device, vk::DeviceSize size, std::string name);
template <typename T>
void fillDeviceBuffer(vk::Device &device, vk::DeviceMemory &mem, const std::vector<T> &input)
{
void *data = device.mapMemory(mem, 0, input.size() * sizeof(T), vk::MemoryMapFlags());
memcpy(data, input.data(), static_cast<size_t>(input.size() * sizeof(T)));
device.unmapMemory(mem);
}
template <typename T>
void fillHostBuffer(vk::Device &device, vk::DeviceMemory &mem, std::vector<T> &output)
{
// copy memory from mem to output
void *data = device.mapMemory(mem, 0, output.size() * sizeof(T), vk::MemoryMapFlags());
memcpy(output.data(), data, static_cast<size_t>(output.size() * sizeof(T)));
device.unmapMemory(mem);
}
template <typename T>
void fillDeviceWithStagingBuffer(vk::PhysicalDevice &pDevice, vk::Device &device,
vk::CommandPool &commandPool, vk::Queue &q,
Buffer &b, const std::vector<T> &data)
{
// Buffer b requires the eTransferDst bit (it is the copy destination)
// data (host) -> staging (device) -> Buffer b (device)
vk::Buffer staging;
vk::DeviceMemory mem;
vk::DeviceSize byteSize = data.size() * sizeof(T);
createBuffer(pDevice, device, byteSize, vk::BufferUsageFlagBits::eTransferSrc,
vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostVisible, "staging",
staging, mem);
// V host -> staging V
fillDeviceBuffer<T>(device, mem, data);
// V staging -> buffer V
copyBuffer(device, q, commandPool, staging, b.buf, byteSize);
device.destroyBuffer(staging);
device.freeMemory(mem);
}
template <typename T>
void fillHostWithStagingBuffer(vk::PhysicalDevice &pDevice, vk::Device &device,
vk::CommandPool &commandPool, vk::Queue &q,
const Buffer &b, std::vector<T> &data)
{
// Buffer b requires the eTransferSrc bit (it is the copy source)
// Buffer b (device) -> staging (device) -> data (host)
vk::Buffer staging;
vk::DeviceMemory mem;
vk::DeviceSize byteSize = data.size() * sizeof(T);
createBuffer(pDevice, device, byteSize, vk::BufferUsageFlagBits::eTransferDst,
vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostVisible, "staging",
staging, mem);
// V buffer -> staging V
copyBuffer(device, q, commandPool, b.buf, staging, byteSize);
// V staging -> host V
fillHostBuffer<T>(device, mem, data);
device.destroyBuffer(staging);
device.freeMemory(mem);
}
template <typename T>
void setObjectName(vk::Device &device, T handle, std::string name)
{
#ifndef NDEBUG
vk::DebugUtilsObjectNameInfoEXT infoEXT(handle.objectType, uint64_t(static_cast<typename T::CType>(handle)), name.c_str());
device.setDebugUtilsObjectNameEXT(infoEXT);
#endif
}
#endif
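
A short sketch of how these helpers combine for a typical upload/dispatch/readback round trip; it assumes an initialized AppResources app and a hypothetical workgroup size, and mirrors what the prepare()/result() implementations further below do:

std::vector<uint> input(1u << 20, 1u);
uint workGroupSize = 256; // hypothetical
Buffer buf;
createBuffer(app.pDevice, app.device, input.size() * sizeof(uint),
             vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc |
             vk::BufferUsageFlagBits::eStorageBuffer,
             vk::MemoryPropertyFlagBits::eDeviceLocal, "exampleBuffer", buf);
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buf, input);
uint32_t groupCount = static_cast<uint32_t>(ceilDiv(input.size(), size_t(workGroupSize)));
// ... bind a compute pipeline and record cb.dispatch(groupCount, 1, 1) ...
std::vector<uint> output(input.size(), 0);
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buf, output);
destroyBuffer(app.device, buf);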

View File

@@ -0,0 +1,21 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint stride;
} p;
layout(binding = 0) buffer inoutBuffer { uint v[]; };
void main() {
// TODO: Kernel implementation
}

View File

@@ -0,0 +1,24 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint offset;
} p;
layout(binding = 0) buffer inBuffer { uint v[]; };
layout(binding = 1) buffer outBuffer { uint g_v[]; };
// TODO: Shared variables
void main() {
// TODO: Kernel implementation
}

View File

@@ -0,0 +1,24 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint offset;
} p;
layout(binding = 0) buffer inBuffer { uint v[]; };
layout(binding = 1) buffer outBuffer { uint g_v[]; };
// TODO: Shared variables
void main() {
// TODO: Kernel implementation
}

View File

@@ -0,0 +1,24 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint offset;
} p;
layout(binding = 0) buffer inBuffer { uint v[]; };
layout(binding = 1) buffer outBuffer { uint g_v[]; };
// TODO: Shared variables
void main() {
// TODO: Kernel implementation
}

View File

@@ -0,0 +1,21 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint offset;
} p;
layout(binding = 0) buffer inoutBuffer { uint v[]; };
void main() {
// TODO: Kernel implementation
}

View File

@@ -0,0 +1,52 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
// Why did we not have bank conflicts in the reduction?
// Because of sequential addressing (here we use interleaved addressing, so we do get conflicts).
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
} p;
layout(binding = 0) buffer inoutBuffer { uint array[]; };
layout(binding = 1) buffer offsetBuffer { uint higherLevelArray[]; };
// TODO: Shared variables
// Bank conflicts
#define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS
// TODO: define your conflict-free macro here
#else
#define OFFSET(A) (A)
#endif
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
void main()
{
// TODO: Kernel implementation
// Cache first half of elements in the local memory
// Cache second half of elements
// Perform up-sweep
// Unroll the last steps when arrived at warp size
// Set the last element to 0
// Perform down-sweep
}
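
A common conflict-free mapping, used here only as an illustration and not prescribed by this skeleton, pads every NUM_BANKS consecutive shared-memory indices with one extra slot, e.g. OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG)); strided accesses during the up- and down-sweep then no longer collide in the same bank, at the cost of (size >> NUM_BANKS_LOG) extra padding elements in the shared array.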

View File

@@ -0,0 +1,25 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout (constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
// Push constant
layout(push_constant) uniform PushStruct {
uint size;
} p;
layout(binding = 0) buffer inoutBuffer { uint v[]; };
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };
// TODO: Shared variables
void main() {
// TODO: Kernel implementation
}

23
shaders/A2Task2Naive.comp Normal file
View File

@@ -0,0 +1,23 @@
#version 450
/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
layout(push_constant) uniform PushStruct {
uint size;
uint offset;
} p;
layout(binding = 0) buffer inBuffer { uint v[]; };
layout(binding = 1) buffer outBuffer { uint g_v[]; };
void main() {
// TODO: Kernel implementation
}

41
src/A2Task1.cpp Normal file
View File

@@ -0,0 +1,41 @@
#include "A2Task1.h"
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#include "task_common.h"
#include "host_timer.h"
A2Task1::A2Task1(uint problemSize) : input(problemSize, 0) {
for (uint i = 0; i < problemSize; i++)
input[i] = i % 97;
computeReference();
}
A2Task1::A2Task1(std::vector<uint> input) : input(input) {
computeReference();
}
bool A2Task1::evaluateSolution(A2Task1Solution& solution) {
solution.prepare(input);
solution.compute();
auto result = solution.result();
if (reference != result) {
std::cout << "error: expected " << reference << ", but got " << result << std::endl;
return false;
}
return true;
}
void A2Task1::computeReference() {
reference = 0;
for (auto e : input)
reference += e;
}

View File

@@ -0,0 +1,85 @@
#include "Interleaved.h"
#include "host_timer.h"
A2Task1SolutionInterleaved::A2Task1SolutionInterleaved(AppResources &app, uint workGroupSize) :
app(app), workGroupSize(workGroupSize) {}
void A2Task1SolutionInterleaved::prepare(const std::vector<uint> &input)
{
mpInput = &input;
Cmn::addStorage(bindings, 0);
Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout);
vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant));
vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr);
pipelineLayout = app.device.createPipelineLayout(pipInfo);
// Specialization constant for workgroup size
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
{{0U, 0U, sizeof(workGroupSize)}},
};
std::array<uint32_t, 1> specValues = {workGroupSize}; //for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
CAST(specValues) * sizeof(int), specValues.data());
Cmn::createShader(app.device, shaderModule, workingDir +"build/shaders/A2Task1Interleaved.comp.spv");
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]),
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer,
vk::MemoryPropertyFlagBits::eDeviceLocal, "inoutBuffer", inoutBuffer);
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, input);
Cmn::createDescriptorPool(app.device, bindings, descriptorPool);
Cmn::allocateDescriptorSet(app.device, descriptorSet, descriptorPool, descriptorSetLayout);
Cmn::bindBuffers(app.device, inoutBuffer.buf, descriptorSet, 0);
}
void A2Task1SolutionInterleaved::compute()
{
vk::CommandBufferAllocateInfo allocInfo(
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
cb.begin(beginInfo);
// TODO: Implement reduction with interleaved addressing
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
HostTimer timer;
app.computeQueue.submit({submitInfo});
app.device.waitIdle();
mstime = timer.elapsed() * 1000;
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
}
uint A2Task1SolutionInterleaved::result() const
{
std::vector<uint> result(1, 0);
fillHostWithStagingBuffer<uint>(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, result);
return result[0];
}
void A2Task1SolutionInterleaved::cleanup()
{
app.device.destroyDescriptorPool(descriptorPool);
app.device.destroyPipeline(pipeline);
app.device.destroyShaderModule(shaderModule);
app.device.destroyPipelineLayout(pipelineLayout);
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
bindings.clear();
destroyBuffer(app.device, inoutBuffer);
}

View File

@@ -0,0 +1,42 @@
#pragma once
#include "A2Task1.h"
class A2Task1SolutionInterleaved : public A2Task1Solution{
public:
A2Task1SolutionInterleaved(AppResources &app, uint workGroupSize);
void prepare(const std::vector<uint> &input) override;
void compute() override;
uint result() const override;
void cleanup() override;
private:
struct PushConstant
{
uint size;
uint stride;
};
AppResources &app;
uint workGroupSize;
const std::vector<uint>* mpInput;
Buffer inoutBuffer;
// Descriptor & Pipeline Layout
std::vector<vk::DescriptorSetLayoutBinding> bindings;
vk::DescriptorSetLayout descriptorSetLayout;
vk::PipelineLayout pipelineLayout;
// Local PPS Pipeline
vk::ShaderModule shaderModule;
vk::Pipeline pipeline;
// Descriptor Pool
vk::DescriptorPool descriptorPool;
// Per-dispatch data
vk::DescriptorSet descriptorSet;
};

View File

@@ -0,0 +1,97 @@
#include "KernelDecomposition.h"
#include "host_timer.h"
A2Task1SolutionKernelDecomposition::A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName) :
app(app), workGroupSize(workGroupSize), shaderFileName(shaderFileName) {}
void A2Task1SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
{
mpInput = &input;
Cmn::addStorage(bindings, 0);
Cmn::addStorage(bindings, 1);
Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout);
vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant));
vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr);
pipelineLayout = app.device.createPipelineLayout(pipInfo);
// Specialization constant for workgroup size
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
{{0U, 0U, sizeof(workGroupSize)}},
};
std::array<uint32_t, 1> specValues = {workGroupSize}; //for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
CAST(specValues) * sizeof(int), specValues.data());
Cmn::createShader(app.device, shaderModule, shaderFileName);
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
for (int i = 0; i < 2; i++) {
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]),
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer,
vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
}
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2);
for (int i = 0; i < 2; i++)
Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout);
Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[0], 0);
Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[0], 1);
Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[1], 0);
Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[1], 1);
}
void A2Task1SolutionKernelDecomposition::compute()
{
vk::CommandBufferAllocateInfo allocInfo(
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
cb.begin(beginInfo);
// TODO: Implement reduction with kernel decomposition
// NOTE: make sure that activeBuffer points to the buffer with the final result in the end
// That buffer is read back for the correctness check
// (A2Task1SolutionKernelDecomposition::result())
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
HostTimer timer;
app.computeQueue.submit({submitInfo});
app.device.waitIdle();
mstime = timer.elapsed() * 1000;
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
}
uint A2Task1SolutionKernelDecomposition::result() const
{
std::vector<uint> result(1, 0);
fillHostWithStagingBuffer<uint>(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
return result[0];
}
void A2Task1SolutionKernelDecomposition::cleanup()
{
app.device.destroyDescriptorPool(descriptorPool);
app.device.destroyPipeline(pipeline);
app.device.destroyShaderModule(shaderModule);
app.device.destroyPipelineLayout(pipelineLayout);
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
bindings.clear();
for (int i = 0; i < 2; i++)
destroyBuffer(app.device, buffers[i]);
}
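
A minimal sketch of the ping-pong recording hinted at in compute() above, meant to go between cb.begin() and cb.end(); it assumes each workgroup reduces workGroupSize input elements to a single partial sum, which is an assumption about the kernel rather than the assignment's reference solution:

cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
uint n = static_cast<uint>(mpInput->size());
activeBuffer = 0; // input currently lives in buffers[0]
while (n > 1) {
    PushConstant pc{n};
    cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(pc), &pc);
    cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0,
                          1, &descriptorSets[activeBuffer], 0, nullptr);
    uint groups = ceilDiv(n, workGroupSize);
    cb.dispatch(groups, 1, 1);
    // make this pass's partial sums visible to the next dispatch
    vk::MemoryBarrier barrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
    cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
                       vk::PipelineStageFlagBits::eComputeShader, {}, barrier, nullptr, nullptr);
    n = groups;                      // one partial sum per workgroup remains
    activeBuffer = 1 - activeBuffer; // the results now live in the other buffer
}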

View File

@@ -0,0 +1,44 @@
#pragma once
#include "A2Task1.h"
class A2Task1SolutionKernelDecomposition : public A2Task1Solution{
public:
A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName);
void prepare(const std::vector<uint> &input) override;
void compute() override;
uint result() const override;
void cleanup() override;
private:
struct PushConstant
{
uint size;
};
AppResources &app;
uint workGroupSize;
std::string shaderFileName;
const std::vector<uint>* mpInput;
Buffer buffers[2];
// Descriptor & Pipeline Layout
std::vector<vk::DescriptorSetLayoutBinding> bindings;
vk::DescriptorSetLayout descriptorSetLayout;
vk::PipelineLayout pipelineLayout;
// Local PPS Pipeline
vk::ShaderModule shaderModule;
vk::Pipeline pipeline;
// Descriptor Pool
vk::DescriptorPool descriptorPool;
// Per-dispatch data
vk::DescriptorSet descriptorSets[2];
uint activeBuffer = 0;
};

View File

@@ -0,0 +1,90 @@
#include "Sequential.h"
#include "host_timer.h"
A2Task1SolutionSequential::A2Task1SolutionSequential(AppResources &app, uint workGroupSize) :
app(app), workGroupSize(workGroupSize) {}
void A2Task1SolutionSequential::prepare(const std::vector<uint> &input)
{
mpInput = &input;
Cmn::addStorage(bindings, 0);
Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout);
vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant));
vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr);
pipelineLayout = app.device.createPipelineLayout(pipInfo);
// Specialization constant for workgroup size
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
{{0U, 0U, sizeof(workGroupSize)}},
};
std::array<uint32_t, 1> specValues = {workGroupSize}; //for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
CAST(specValues) * sizeof(int), specValues.data());
Cmn::createShader(app.device, shaderModule, workingDir +"build/shaders/A2Task1Sequential.comp.spv");
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);
createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]),
vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer,
vk::MemoryPropertyFlagBits::eDeviceLocal, "inoutBuffer", inoutBuffer.buf, inoutBuffer.mem);
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, input);
Cmn::createDescriptorPool(app.device, bindings, descriptorPool);
Cmn::allocateDescriptorSet(app.device, descriptorSet, descriptorPool, descriptorSetLayout);
Cmn::bindBuffers(app.device, inoutBuffer.buf, descriptorSet, 0);
}
void A2Task1SolutionSequential::compute()
{
vk::CommandBufferAllocateInfo allocInfo(
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
cb.begin(beginInfo);
// TODO: Implement reduction with sequential addressing
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
HostTimer timer;
app.computeQueue.submit({submitInfo});
app.device.waitIdle();
mstime = timer.elapsed() * 1000;
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
}
uint A2Task1SolutionSequential::result() const
{
std::vector<uint> result(1, 0);
fillHostWithStagingBuffer<uint>(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, result);
return result[0];
}
void A2Task1SolutionSequential::cleanup()
{
app.device.destroyDescriptorPool(descriptorPool);
app.device.destroyPipeline(pipeline);
app.device.destroyShaderModule(shaderModule);
app.device.destroyPipelineLayout(pipelineLayout);
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
bindings.clear();
auto Bclean = [&](Buffer &b){
app.device.destroyBuffer(b.buf);
app.device.freeMemory(b.mem);
};
Bclean(inoutBuffer);
}

View File

@@ -0,0 +1,42 @@
#pragma once
#include "A2Task1.h"
class A2Task1SolutionSequential : public A2Task1Solution{
public:
A2Task1SolutionSequential(AppResources &app, uint workGroupSize);
void prepare(const std::vector<uint> &input) override;
void compute() override;
uint result() const override;
void cleanup() override;
private:
struct PushConstant
{
uint size;
uint offset;
};
AppResources &app;
uint workGroupSize;
const std::vector<uint>* mpInput;
Buffer inoutBuffer;
// Descriptor & Pipeline Layout
std::vector<vk::DescriptorSetLayoutBinding> bindings;
vk::DescriptorSetLayout descriptorSetLayout;
vk::PipelineLayout pipelineLayout;
// Local PPS Pipeline
vk::ShaderModule shaderModule;
vk::Pipeline pipeline;
// Descriptor Pool
vk::DescriptorPool descriptorPool;
// Per-dispatch data
vk::DescriptorSet descriptorSet;
};

42
src/A2Task2.cpp Normal file
View File

@@ -0,0 +1,42 @@
#include "A2Task2.h"
A2Task2::A2Task2(uint problemSize) : input(problemSize, 0) {
for (uint i = 0; i < problemSize; i++)
input[i] = i % 97;
computeReference();
}
A2Task2::A2Task2(std::vector<uint> input) : input(input) {
computeReference();
}
void A2Task2::computeReference() {
reference.reserve(input.size());
uint acc = 0;
for (size_t i = 0; i < input.size(); i++) {
acc += input[i];
reference.push_back(acc);
}
}
bool A2Task2::evaluateSolution(A2Task2Solution& solution) {
solution.prepare(input);
solution.compute();
auto result = solution.result();
if (result.size() != reference.size()) {
std::cout << "error: result and reference vector size don't match!";
return false;
}
for (uint i = 0; i < reference.size(); i++) {
if (result[i] != reference[i]) {
std::cout << "error: result and reference don't match at index " << i << "!" << std::endl;
std::cout << "\tresult: " << result[i] << std::endl;
std::cout << "\treference: " << reference[i] << std::endl;
return false;
}
}
return true;
}
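
Note that computeReference produces an inclusive prefix sum (reference[i] includes input[i]), whereas the Blelloch-style up-/down-sweep outlined in A2Task2KernelDecomposition.comp naturally yields an exclusive scan; a solution built on that pattern has to shift the output or add the inputs back in to match this reference.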

View File

@@ -0,0 +1,114 @@
#include "KernelDecomposition.h"
#include "host_timer.h"
A2Task2SolutionKernelDecomposition::A2Task2SolutionKernelDecomposition(AppResources& app, uint workGroupSize): app(app),
workGroupSize(workGroupSize) {
}
void A2Task2SolutionKernelDecomposition::prepare(const std::vector<uint>& input) {
workSize = input.size();
// Descriptor & Pipeline Layout
Cmn::addStorage(bindings, 0);
Cmn::addStorage(bindings, 1);
Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout);
vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct));
vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr);
pipelineLayout = app.device.createPipelineLayout(pipInfo);
// Specialization constant for workgroup size
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
{{0U, 0U, sizeof(workGroupSize)}},
};
std::array<uint32_t, 1> specValues = {workGroupSize}; //for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
CAST(specValues) * sizeof(int), specValues.data());
// Local PPS Pipeline
Cmn::createShader(app.device, cShaderLocalPPS, workingDir + "build/shaders/A2Task2KernelDecomposition.comp.spv");
Cmn::createPipeline(app.device, pipelineLocalPPS, pipelineLayout, specInfo, cShaderLocalPPS);
// Local PPS Offset Pipeline
Cmn::createShader(app.device, cShaderLocalPPSOffset,
workingDir + "build/shaders/A2Task2KernelDecompositionOffset.comp.spv");
Cmn::createPipeline(app.device, pipelineLocalPPSOffset, pipelineLayout, specInfo, cShaderLocalPPSOffset);
// ### create buffers, get their index in the task.buffers[] array ###
using BFlag = vk::BufferUsageFlagBits;
auto makeDLocalBuffer = [ this ](vk::BufferUsageFlags usage, vk::DeviceSize size, std::string name) -> Buffer {
Buffer b;
createBuffer(app.pDevice, app.device, size, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, name, b.buf,
b.mem);
return b;
};
inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
input.size() * sizeof(uint32_t), "buffer_inout_0"));
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0],
input);
// TODO: create additional buffers (by pushing into inoutBuffers) and descriptors (by pushing into descriptorSets)
// You need to create an appropriately-sized DescriptorPool first
}
void A2Task2SolutionKernelDecomposition::compute() {
vk::CommandBufferAllocateInfo allocInfo(
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
cb.begin(beginInfo);
// TODO: Implement efficient version of scan
// Make sure that the local prefix sum works before you start experimenting with large arrays
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
HostTimer timer;
app.computeQueue.submit({submitInfo});
app.device.waitIdle();
mstime = timer.elapsed() * 1000;
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
}
std::vector<uint> A2Task2SolutionKernelDecomposition::result() const {
std::vector<uint> result(workSize, 0);
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0],
result);
return result;
}
void A2Task2SolutionKernelDecomposition::cleanup() {
app.device.destroyDescriptorPool(descriptorPool);
app.device.destroyPipeline(pipelineLocalPPSOffset);
app.device.destroyShaderModule(cShaderLocalPPSOffset);
app.device.destroyPipeline(pipelineLocalPPS);
app.device.destroyShaderModule(cShaderLocalPPS);
app.device.destroyPipelineLayout(pipelineLayout);
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
bindings.clear();
auto Bclean = [&](Buffer& b) {
app.device.destroyBuffer(b.buf);
app.device.freeMemory(b.mem);
};
for (auto inoutBuffer: inoutBuffers) {
Bclean(inoutBuffer);
}
inoutBuffers.clear();
}
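
One possible buffer layout for the TODO in prepare() above, assuming the classic kernel-decomposition scan in which every workgroup also writes its block sum into a smaller, higher-level array (buffer name and pool size are illustrative):

// inside prepare(), after inoutBuffers[0] has been filled:
uint groups = ceilDiv(workSize, workGroupSize);   // number of level-0 workgroups
inoutBuffers.push_back(makeDLocalBuffer(
    BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
    groups * sizeof(uint32_t), "buffer_blocksums_0"));
// one descriptor set per scan level; size the pool accordingly
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2);
// then allocate a set per level with Cmn::allocateDescriptorSet and wire the
// level's input/output buffers with Cmn::bindBuffers; very large inputs need
// further levels (and buffers) until a single workgroup covers the block sums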

View File

@@ -0,0 +1,55 @@
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#include "task_common.h"
#include "A2Task2.h"
struct A2Task2SolutionKernelDecomposition : A2Task2Solution {
public:
A2Task2SolutionKernelDecomposition(AppResources &app, uint workGroupSize);
void prepare(const std::vector<uint> &input) override;
void compute() override;
std::vector<uint> result() const override;
void cleanup() override;
private:
struct PushStruct
{
uint32_t size;
};
AppResources &app;
uint workGroupSize;
std::string localPPSShaderFileName;
uint workSize;
std::vector<Buffer> inoutBuffers;
// Descriptor & Pipeline Layout
std::vector<vk::DescriptorSetLayoutBinding> bindings;
vk::DescriptorSetLayout descriptorSetLayout;
vk::PipelineLayout pipelineLayout;
// Local PPS Pipeline
vk::ShaderModule cShaderLocalPPS;
vk::Pipeline pipelineLocalPPS;
// Local PPS Offset Pipeline
vk::ShaderModule cShaderLocalPPSOffset;
vk::Pipeline pipelineLocalPPSOffset;
// Descriptor Pool
vk::DescriptorPool descriptorPool;
// TODO extend with any additional members you may need
};

View File

@@ -0,0 +1,100 @@
#include "Naive.h"
#include "host_timer.h"
A2Task2SolutioNaive::A2Task2SolutioNaive(
AppResources &app, uint workGroupSize):
app(app), workGroupSize(workGroupSize) {}
void A2Task2SolutioNaive::prepare(const std::vector<uint> &input) {
workSize = input.size();
// Descriptor & Pipeline Layout
Cmn::addStorage(bindings, 0);
Cmn::addStorage(bindings, 1);
Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout);
vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct));
vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr);
pipelineLayout = app.device.createPipelineLayout(pipInfo);
// Specialization constant for workgroup size
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
{{0U, 0U, sizeof(workGroupSize)}},
};
std::array<uint32_t, 2> specValues = {workGroupSize}; //for workgroup sizes
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
CAST(specValues) * sizeof(int), specValues.data());
// Local PPS Offset Pipeline
Cmn::createShader(app.device, cShader, workingDir +"build/shaders/A2Task2Naive.comp.spv");
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, cShader);
// ### create buffers, get their index in the task.buffers[] array ###
using BFlag = vk::BufferUsageFlagBits;
for (int i = 0; i < 2; i++)
createBuffer(app.pDevice, app.device, input.size() * sizeof(uint32_t), BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i]);
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2);
for (uint i = 0; i < 2; i++)
Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout);
Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[0], 0);
Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[0], 1);
Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[1], 0);
Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[1], 1);
activeBuffer = 0;
}
void A2Task2SolutioNaive::compute() {
vk::CommandBufferAllocateInfo allocInfo(
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
cb.begin(beginInfo);
cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
// TODO: Implement naive scan
// NOTE: make sure that activeBuffer points to the buffer with the final result in the end
// That buffer is read back for the correctness check
// (A2Task2SolutionNaive::result())
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
cb.end();
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
HostTimer timer;
app.computeQueue.submit({submitInfo});
app.device.waitIdle();
mstime = timer.elapsed() * 1000;
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
}
std::vector<uint> A2Task2SolutioNaive::result() const {
std::vector<uint> result(workSize, 0);
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
return result;
}
void A2Task2SolutioNaive::cleanup() {
app.device.destroyDescriptorPool(descriptorPool);
app.device.destroyPipeline(pipeline);
app.device.destroyShaderModule(cShader);
app.device.destroyPipelineLayout(pipelineLayout);
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
bindings.clear();
for (auto buffer : buffers)
destroyBuffer(app.device, buffer);
}
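
A minimal sketch of the offset-doubling (Hillis-Steele) loop the hint describes, meant to sit after the bindPipeline call above; the per-pass semantics (v_out[i] = v_in[i] + v_in[i - offset] for i >= offset, plain copy otherwise) are left to the shader, so treat this as an assumption rather than the reference solution:

activeBuffer = 0; // input starts in buffers[0]
for (uint offset = 1; offset < workSize; offset *= 2) {
    PushStruct pc{workSize, offset};
    cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(pc), &pc);
    cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0,
                          1, &descriptorSets[activeBuffer], 0, nullptr);
    cb.dispatch(ceilDiv(workSize, workGroupSize), 1, 1);
    // make this pass's writes visible before the next pass reads them
    vk::MemoryBarrier barrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
    cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader,
                       vk::PipelineStageFlagBits::eComputeShader, {}, barrier, nullptr, nullptr);
    activeBuffer = 1 - activeBuffer; // the pass wrote into the other buffer
}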

View File

@@ -0,0 +1,53 @@
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#include "task_common.h"
#include "A2Task2.h"
struct A2Task2SolutioNaive : A2Task2Solution {
public:
A2Task2SolutioNaive(AppResources &app, uint workGroupSize);
void prepare(const std::vector<uint> &input) override;
void compute() override;
std::vector<uint> result() const override;
void cleanup() override;
private:
struct PushStruct
{
uint size;
uint offset;
};
AppResources &app;
uint workGroupSize;
uint workSize;
Buffer buffers[2];
// Descriptor & Pipeline Layout
std::vector<vk::DescriptorSetLayoutBinding> bindings;
vk::DescriptorSetLayout descriptorSetLayout;
vk::PipelineLayout pipelineLayout;
vk::ShaderModule cShader;
vk::Pipeline pipeline;
// Descriptor Pool
vk::DescriptorPool descriptorPool;
// Descriptors
vk::DescriptorSet descriptorSets[2];
uint activeBuffer = 0;
};

15
src/host_timer.cpp Normal file
View File

@@ -0,0 +1,15 @@
#include "host_timer.h"
HostTimer::HostTimer() {
reset();
}
void HostTimer::reset() {
start = clock::now();
}
double HostTimer::elapsed() const {
auto end = clock::now();
std::chrono::duration<double> duration = end - start;
return duration.count();
}

518
src/initialization.cpp Normal file
View File

@@ -0,0 +1,518 @@
#include <iostream>
#include <fstream>
#include <cstring>
#include <functional>
#include <optional>
#define VK_ENABLE_BETA_EXTENSIONS
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE
#include "initialization.h"
#include "utils.h"
#include <optional>
// Here you create the instance and physical / logical device and maybe compute/transfer queues
// Also check if device is suitable etc
struct DeviceSelectionCache {
uint32_t vendorID;
uint32_t deviceID;
};
#ifdef NDEBUG
const bool enableValidationLayers = false;
#else
const bool enableValidationLayers = true;
#endif
const std::vector<const char*> validationLayers = {
#ifndef NDEBUG
"VK_LAYER_KHRONOS_validation"
#endif
};
const std::vector<const char*> instanceExtensions = {
#ifndef NDEBUG
VK_EXT_DEBUG_UTILS_EXTENSION_NAME,
#endif
};
const std::vector<const char*> extensionNames = {
#ifndef NDEBUG
#endif
};
void AppResources::destroy()
{
this->device.destroyQueryPool(this->queryPool);
//this->device.freeCommandBuffers(this->computeCommandPool, 1U, &this->computeCommandBuffer);
//this->device.freeCommandBuffers(this->transferCommandPool, 1U, &this->transferCommandBuffer);
this->device.destroyCommandPool(this->computeCommandPool);
//this->device.destroyCommandPool(this->transferCommandPool);
this->device.destroy();
#ifndef NDEBUG
this->instance.destroyDebugUtilsMessengerEXT(this->dbgUtilsMgr);
#endif
this->instance.destroy();
}
void initApp(AppResources& app)
{
createInstance(app.instance, app.dbgUtilsMgr, "Assignment1, Task 1", "Idkwhattowrite");
selectPhysicalDevice(app.instance, app.pDevice);
auto chain = app.pDevice.getProperties2<vk::PhysicalDeviceProperties2, vk::PhysicalDeviceSubgroupProperties>();
app.pDeviceProperties = chain.get<vk::PhysicalDeviceProperties2>();
app.pDeviceSubgroupProperties = chain.get<vk::PhysicalDeviceSubgroupProperties>();
std::tie(app.cQ, app.tQ) = getComputeAndTransferQueues(app.pDevice);
createLogicalDevice(app.instance, app.pDevice, app.device);
app.device.getQueue(app.cQ, 0U, &app.computeQueue);
app.transferQueue = app.computeQueue;
app.tQ = app.cQ;
//app.device.getQueue(app.tQ, 0U, &app.transferQueue);
//createCommandPool(app.device, app.transferCommandPool, app.tQ);
createCommandPool(app.device, app.computeCommandPool, app.cQ);
app.transferCommandPool = app.computeCommandPool;
createTimestampQueryPool(app.device, app.queryPool, 2);
}
//This is the function in which errors will go through to be displayed.
VKAPI_ATTR VkBool32 VKAPI_CALL
debugUtilsMessengerCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
VkDebugUtilsMessageTypeFlagsEXT messageTypes,
VkDebugUtilsMessengerCallbackDataEXT const* pCallbackData,
void* /*pUserData*/)
{
if (enableValidationLayers)
{
if (pCallbackData->messageIdNumber == 648835635)
{
// UNASSIGNED-khronos-Validation-debug-build-warning-message
return VK_FALSE;
}
if (pCallbackData->messageIdNumber == 767975156)
{
// UNASSIGNED-BestPractices-vkCreateInstance-specialuse-extension
return VK_FALSE;
}
}
std::cerr << vk::to_string(static_cast<vk::DebugUtilsMessageSeverityFlagBitsEXT>(messageSeverity)) << ": "
<< vk::to_string(static_cast<vk::DebugUtilsMessageTypeFlagsEXT>(messageTypes)) << ":\n";
std::cerr << "\t"
<< "messageIDName = <" << pCallbackData->pMessageIdName << ">\n";
std::cerr << "\t"
<< "messageIdNumber = " << pCallbackData->messageIdNumber << "\n";
std::cerr << "\t"
<< "message = <" << pCallbackData->pMessage << ">\n";
if (0 < pCallbackData->queueLabelCount)
{
std::cerr << "\t"
<< "Queue Labels:\n";
for (uint8_t i = 0; i < pCallbackData->queueLabelCount; i++)
{
std::cerr << "\t\t"
<< "labelName = <" << pCallbackData->pQueueLabels[i].pLabelName << ">\n";
}
}
if (0 < pCallbackData->cmdBufLabelCount)
{
std::cerr << "\t"
<< "CommandBuffer Labels:\n";
for (uint8_t i = 0; i < pCallbackData->cmdBufLabelCount; i++)
{
std::cerr << "\t\t"
<< "labelName = <" << pCallbackData->pCmdBufLabels[i].pLabelName << ">\n";
}
}
if (0 < pCallbackData->objectCount)
{
std::cerr << "\t"
<< "Objects:\n";
for (uint8_t i = 0; i < pCallbackData->objectCount; i++)
{
std::cerr << "\t\t"
<< "Object " << i << "\n";
std::cerr << "\t\t\t"
<< "objectType = "
<< vk::to_string(static_cast<vk::ObjectType>(pCallbackData->pObjects[i].objectType)) << "\n";
std::cerr << "\t\t\t"
<< "objectHandle = " << pCallbackData->pObjects[i].objectHandle << "\n";
if (pCallbackData->pObjects[i].pObjectName)
{
std::cerr << "\t\t\t"
<< "objectName = <" << pCallbackData->pObjects[i].pObjectName << ">\n";
}
}
}
return VK_TRUE;
}
/*
This function fills the structure with flags indicating
which error messages should go through
*/
vk::DebugUtilsMessengerCreateInfoEXT makeDebugUtilsMessengerCreateInfoEXT()
{
using SEVERITY = vk::DebugUtilsMessageSeverityFlagBitsEXT; // for readability
using MESSAGE = vk::DebugUtilsMessageTypeFlagBitsEXT;
return { {},
SEVERITY::eWarning | SEVERITY::eError,
MESSAGE::eGeneral | MESSAGE::ePerformance | MESSAGE::eValidation,
&debugUtilsMessengerCallback };
}
/*
The dynamic loader allows us to access many extensions
Required before creating instance for loading the extension VK_EXT_DEBUG_UTILS_EXTENSION_NAME
*/
void initDynamicLoader()
{
#if VK_HEADER_VERSION >= 301
using VulkanDynamicLoader = vk::detail::DynamicLoader;
#else
using VulkanDynamicLoader = vk::DynamicLoader;
#endif
static VulkanDynamicLoader dl;
static PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = dl.getProcAddress<PFN_vkGetInstanceProcAddr>("vkGetInstanceProcAddr");
VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr);
}
void createInstance(vk::Instance& instance, vk::DebugUtilsMessengerEXT& debugUtilsMessenger,
std::string appName, std::string engineName)
{
initDynamicLoader();
vk::ApplicationInfo applicationInfo(appName.c_str(), 1, engineName.c_str(), 1, VK_API_VERSION_1_2);
//Initialize the InstanceCreateInfo
vk::InstanceCreateInfo instanceCreateInfo( //flags, pAppInfo, layerCount, layerNames, extcount, extNames
{}, &applicationInfo,
static_cast<uint32_t>(validationLayers.size()), validationLayers.data(),
static_cast<uint32_t>(instanceExtensions.size()), instanceExtensions.data());
// DebugInfo: use of StructureChain instead of pNext
// DebugUtils is used to catch errors from the instance
vk::DebugUtilsMessengerCreateInfoEXT debugCreateInfo = makeDebugUtilsMessengerCreateInfoEXT();
// The StructureChain fills the pNext member of the struct in a typesafe way
// This is only possible with vulkan-hpp, in plain vulkan there is no typechecking
vk::StructureChain<vk::InstanceCreateInfo, vk::DebugUtilsMessengerCreateInfoEXT> chain =
{ instanceCreateInfo, debugCreateInfo };
if (!enableValidationLayers) //For Release mode
chain.unlink<vk::DebugUtilsMessengerCreateInfoEXT>();
// Create an Instance
instance = vk::createInstance(chain.get<vk::InstanceCreateInfo>());
// Update the dispatcher to use instance related extensions
VULKAN_HPP_DEFAULT_DISPATCHER.init(instance);
if (enableValidationLayers)
debugUtilsMessenger = instance.createDebugUtilsMessengerEXT(makeDebugUtilsMessengerCreateInfoEXT());
}
std::tuple<uint32_t, uint32_t> getComputeAndTransferQueues(vk::PhysicalDevice& pDevice)
{
uint32_t tq = -1;
std::optional<uint32_t> otq;
uint32_t cq = -1;
std::optional<uint32_t> ocq;
using Chain = vk::StructureChain<vk::QueueFamilyProperties2, vk::QueueFamilyCheckpointPropertiesNV>;
using QFB = vk::QueueFlagBits;
#if VK_HEADER_VERSION >= 301
using VulkanDispatchLoaderDynamic = vk::detail::DispatchLoaderDynamic;
#else
using VulkanDispatchLoaderDynamic = vk::DispatchLoaderDynamic;
#endif
auto queueFamilyProperties2 = pDevice.getQueueFamilyProperties2<Chain, std::allocator<Chain>, VulkanDispatchLoaderDynamic>();
for (uint32_t j = 0; j < queueFamilyProperties2.size(); j++)
{
vk::QueueFamilyProperties const& properties =
queueFamilyProperties2[static_cast<size_t>(j)].get<vk::QueueFamilyProperties2>().queueFamilyProperties;
if (properties.queueFlags & QFB::eCompute)
{
if (!(properties.queueFlags & QFB::eGraphics ||
properties.queueFlags & QFB::eProtected))
ocq = j; // When a queue supports only compute and not graphics we want to use that
cq = j;
}
if (properties.queueFlags & QFB::eTransfer)
{
if (!(properties.queueFlags & QFB::eCompute ||
properties.queueFlags & QFB::eGraphics ||
properties.queueFlags & QFB::eProtected))
otq = j; // When a queue supports only transfer, we want to use this one
tq = j;
}
}
if (otq.has_value())
tq = otq.value();
if (ocq.has_value())
cq = ocq.value();
return std::tuple<uint32_t, uint32_t>(cq, tq);
}
void selectPhysicalDevice(vk::Instance& instance, vk::PhysicalDevice& pDevice)
{
// Takes the first one
std::vector<vk::PhysicalDevice> physDs = instance.enumeratePhysicalDevices();
const static char* cache_name = "device_selection_cache";
const static char* recreation_message = "To select a new device, delete the file \"device_selection_cache\" in your working directory before executing the framework.";
std::ifstream ifile(cache_name, std::ios::binary);
if (ifile.is_open()) {
DeviceSelectionCache cache;
ifile.read(reinterpret_cast<char*>(&cache), sizeof(cache));
ifile.close();
for (auto physD : physDs) {
auto props = physD.getProperties2().properties;
if (props.vendorID == cache.vendorID && props.deviceID == cache.deviceID) {
std::cout << "Selecting previously selected device: \"" << props.deviceName << "\"" << std::endl;
std::cout << recreation_message << std::endl;
pDevice = physD;
return;
}
}
std::cout << "Previously selected device was not found." << std::endl;
}
else {
std::cout << "No previous device selection found." << std::endl;
}
std::cout << "Select one of the available devices:" << std::endl;
for (int i = 0; i < physDs.size(); i++) {
auto props = physDs[i].getProperties2().properties;
std::cout << i << ")\t" << props.deviceName.data() << std::endl;
}
uint32_t i;
while (true) {
std::cout << "Enter device number: ";
std::cin >> i;
if (i < physDs.size()) break;
}
auto props = physDs[i].getProperties2().properties;
DeviceSelectionCache cache;
cache.vendorID = props.vendorID;
cache.deviceID = props.deviceID;
std::ofstream ofile(cache_name, std::ios::out | std::ios::binary);
ofile.write(reinterpret_cast<const char*>(&cache), sizeof(cache));
ofile.close();
std::cout << "Selected device: \"" << props.deviceName.data() << "\"" << std::endl
<< "This device will be automatically selected in the future." << std::endl
<< recreation_message << std::endl;
pDevice = physDs[i];
}
// The logical device holds the queues and will be used in almost every call from now on
void createLogicalDevice(vk::Instance& instance, vk::PhysicalDevice& pDevice, vk::Device& device)
{
//First get the queues
uint32_t cQ, tQ;
std::tie(cQ, tQ) = getComputeAndTransferQueues(pDevice);
std::vector<vk::DeviceQueueCreateInfo> queuesInfo;
// flags, queueFamily, queueCount, queuePriority
float prio = 1.f;
vk::DeviceQueueCreateInfo computeInfo({}, cQ, 1U, &prio);
vk::DeviceQueueCreateInfo transferInfo({}, tQ, 1U, &prio);
queuesInfo.push_back(computeInfo);
//queuesInfo.push_back(transferInfo);
// {}, queueCreateInfoCount, pQueueCreateInfos, enabledLayerCount, ppEnabledLayerNames, enabledExtensionCount, ppEnabledExtensionNames, pEnabledFeatures
std::vector extensionNames_(extensionNames);
auto deviceExtensionProperties = pDevice.enumerateDeviceExtensionProperties();
bool enable_portability_subset = false;
for (auto ext : deviceExtensionProperties) {
if (strcmp(ext.extensionName.data(), VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME) == 0) {
enable_portability_subset = true;
}
}
if (enable_portability_subset) {
extensionNames_.push_back(VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME);
}
vk::DeviceCreateInfo dci({}, CAST(queuesInfo), queuesInfo.data(),
CAST(validationLayers), validationLayers.data(),
CAST(extensionNames_), extensionNames_.data()); // device extensions (only the portability subset, if available)
device = pDevice.createDevice(dci);
VULKAN_HPP_DEFAULT_DISPATCHER.init(device);
setObjectName(device, device, "This is my lovely device !");
}
void createCommandPool(vk::Device& device, vk::CommandPool& commandPool, uint32_t queueIndex)
{
vk::CommandPoolCreateInfo cpi(vk::CommandPoolCreateFlags(), queueIndex);
commandPool = device.createCommandPool(cpi);
}
void destroyInstance(vk::Instance& instance, vk::DebugUtilsMessengerEXT& debugUtilsMessenger)
{
#ifndef NDEBUG
instance.destroyDebugUtilsMessengerEXT(debugUtilsMessenger);
#endif
instance.destroy();
}
void destroyLogicalDevice(vk::Device& device)
{
device.destroy();
}
void destroyCommandPool(vk::Device& device, vk::CommandPool& commandPool)
{
device.destroyCommandPool(commandPool);
commandPool = vk::CommandPool();
}
void showAvailableQueues(vk::PhysicalDevice& pDevice, bool diagExt)
{
using Chain = vk::StructureChain<vk::QueueFamilyProperties2, vk::QueueFamilyCheckpointPropertiesNV>;
#if VK_HEADER_VERSION >= 301
using VulkanDispatchLoaderDynamic = vk::detail::DispatchLoaderDynamic;
#else
using VulkanDispatchLoaderDynamic = vk::DispatchLoaderDynamic;
#endif
auto queueFamilyProperties2 = pDevice.getQueueFamilyProperties2<Chain, std::allocator<Chain>, VulkanDispatchLoaderDynamic>();
for (size_t j = 0; j < queueFamilyProperties2.size(); j++)
{
std::cout << "\t"
<< "QueueFamily " << j << "\n";
vk::QueueFamilyProperties const& properties =
queueFamilyProperties2[j].get<vk::QueueFamilyProperties2>().queueFamilyProperties;
std::cout << "\t\t"
<< "QueueFamilyProperties:\n";
std::cout << "\t\t\t"
<< "queueFlags = " << vk::to_string(properties.queueFlags) << "\n";
std::cout << "\t\t\t"
<< "queueCount = " << properties.queueCount << "\n";
std::cout << "\t\t\t"
<< "timestampValidBits = " << properties.timestampValidBits << "\n";
std::cout << "\t\t\t"
<< "minImageTransferGranularity = " << properties.minImageTransferGranularity.width << " x "
<< properties.minImageTransferGranularity.height << " x "
<< properties.minImageTransferGranularity.depth << "\n";
std::cout << "\n";
if (diagExt)
{
vk::QueueFamilyCheckpointPropertiesNV const& checkpointProperties =
queueFamilyProperties2[j].get<vk::QueueFamilyCheckpointPropertiesNV>();
std::cout << "\t\t"
<< "CheckPointPropertiesNV:\n";
std::cout << "\t\t\t"
<< "checkpointExecutionStageMask = "
<< vk::to_string(checkpointProperties.checkpointExecutionStageMask) << "\n";
std::cout << "\n";
}
}
}
void createTimestampQueryPool(vk::Device& device, vk::QueryPool& queryPool, uint32_t queryCount)
{
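// Each slot in a timestamp query pool stores one GPU timestamp written with
// vkCmdWriteTimestamp; pairs of timestamps give GPU execution times.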
vk::QueryPoolCreateInfo createInfo({}, vk::QueryType::eTimestamp, queryCount);
queryPool = device.createQueryPool(createInfo);
}
void destroyQueryPool(vk::Device& device, vk::QueryPool& queryPool)
{
device.destroyQueryPool(queryPool);
queryPool = vk::QueryPool();
}
void printDeviceCapabilities(vk::PhysicalDevice& pDevice)
{
//vk::PhysicalDeviceFeatures features = physicalDevice.getFeatures();
std::vector<vk::ExtensionProperties> ext = pDevice.enumerateDeviceExtensionProperties();
std::vector<vk::LayerProperties> layers = pDevice.enumerateDeviceLayerProperties();
vk::PhysicalDeviceMemoryProperties memoryProperties = pDevice.getMemoryProperties();
vk::PhysicalDeviceProperties properties = pDevice.getProperties();
vk::PhysicalDeviceType dt = properties.deviceType;
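// apiVersion is a packed version number (decode with VK_API_VERSION_MAJOR/MINOR/PATCH);
// driverVersion uses a vendor-specific encoding.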
std::cout << "====================" << std::endl
<< "Device Name: " << properties.deviceName << std::endl
<< "Device ID: " << properties.deviceID << std::endl
<< "Device Type: " << vk::to_string(properties.deviceType) << std::endl
<< "Driver Version: " << properties.driverVersion << std::endl
<< "API Version: " << properties.apiVersion << std::endl
<< "====================" << std::endl
<< std::endl;
bool budgetExt = false;
bool diagExt = false;
std::cout << "This device supports the following extensions (" << ext.size() << "): " << std::endl;
for (vk::ExtensionProperties e : ext)
{
std::cout << std::string(e.extensionName.data()) << std::endl;
if (std::string(e.extensionName.data()) == VK_EXT_MEMORY_BUDGET_EXTENSION_NAME)
budgetExt = true;
if (std::string(e.extensionName.data()) == VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME)
diagExt = true;
}
std::cout << "This device supports the following memory types (" << memoryProperties.memoryTypeCount << "): " << std::endl;
uint32_t c = 0U;
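// memoryTypes is a fixed-size array (VK_MAX_MEMORY_TYPES); only the first
// memoryTypeCount entries are valid, hence the early break below.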
for (vk::MemoryType e : memoryProperties.memoryTypes)
{
if (c >= memoryProperties.memoryTypeCount)
break;
std::cout << e.heapIndex << "\t ";
std::cout << vk::to_string(e.propertyFlags) << std::endl;
c++;
}
std::cout << "====================" << std::endl
<< std::endl;
if (budgetExt)
{
std::cout << "This device has the following heaps (" << memoryProperties.memoryHeapCount << "): " << std::endl;
c = 0U;
for (vk::MemoryHeap e : memoryProperties.memoryHeaps)
{
if (c >= memoryProperties.memoryHeapCount)
break;
std::cout << "Size: " << formatSize(e.size) << "\t ";
std::cout << vk::to_string(e.flags) << std::endl;
c++;
}
}
std::cout << "====================" << std::endl
<< std::endl
<< "This device has the following layers (" << layers.size() << "): " << std::endl;
for (vk::LayerProperties l : layers)
std::cout << std::string(l.layerName.data()) << "\t : " << std::string(l.description.data()) << std::endl;
std::cout << "====================" << std::endl
<< std::endl;
showAvailableQueues(pDevice, diagExt);
}

136
src/main.cpp Normal file
View File

@@ -0,0 +1,136 @@
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "initialization.h"
#include "utils.h"
#include "A2Task1.h"
#include "A2Task2.h"
#include "A2Task1Solution/Sequential.h"
#include "A2Task1Solution/Interleaved.h"
#include "A2Task1Solution/KernelDecomposition.h"
#include "A2Task2Solution/Naive.h"
#include "A2Task2Solution/KernelDecomposition.h"
#include "renderdoc.h"
void run_A2_task1(AppResources &app){
size_t size = 128*1024*1024;
A2Task1 a2Task1(size);
std::cout<<"====== A2 TASK 1 ======" <<std::endl;
auto evaluateTask1Solution = [&](A2Task1Solution* solution, std::string name, int N=10) {
std::cout << "[Task1] evaluating " << name << " with size: "<<size<< std::endl;
bool pass = true;
float mstime = 0.f;
for (int i = 0; i < N; i++) {
pass &= a2Task1.evaluateSolution(*solution);
solution->cleanup();
mstime += solution->mstime / N;
if (!pass) break;
}
if (pass) {
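// size/mstime is elements per millisecond; dividing by 1e6 converts to 1e9 elements per second (GE/s)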
std::cout << "TEST PASSED. Execution time: " << mstime<< " ms, "
<< "Throughput: " << size / mstime / 1000000 << " GE/s" << std::endl;
} else {
std::cout << "TEST FAILED" << std::endl;
}
};
A2Task1SolutionInterleaved interleavedSolution(app, 128);
evaluateTask1Solution(&interleavedSolution, "Interleaved");
A2Task1SolutionSequential sequentialSolution(app, 128);
evaluateTask1Solution(&sequentialSolution, "Sequential");
A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
A2Task1SolutionKernelDecomposition kernelDecompositionAtomicSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionAtomic.comp.spv");
evaluateTask1Solution(&kernelDecompositionAtomicSolution, "KernelDecomposition Atomic");
}
void run_A2_task2(AppResources& app){
size_t size = 128*1024*1024;
std::cout<<"====== A2 TASK 2 ======" <<std::endl;
// This is used to test the local kernel decomposition before it is extended to arbitrary array sizes.
// Must be power of two and <= 1024!
size_t sizeLocal = 128;
A2Task2 a2Task2(size);
A2Task2 a2Task2Local(sizeLocal);
auto evaluateTask2Solution = [&](A2Task2 *task, A2Task2Solution* solution, std::string name, int N) {
std::cout << "[Task2] evaluating " << name << " with size: "<< task->size() << std::endl;
bool pass = true;
float mstime = 0.f;
for (int i = 0; i < N; i++) {
pass &= task->evaluateSolution(*solution);
solution->cleanup();
mstime += solution->mstime / N;
if (!pass) break;
}
if (pass) {
std::cout << "Execution time: " << mstime<< " ms, "
<< "Throughput: " << task->size() / mstime / 1000000 << " GE/s" << std::endl;
std::cout << "TEST PASSED" << std::endl;
} else {
std::cout << "TEST FAILED" << std::endl;
}
};
A2Task2SolutioNaive naiveSolution(app, 128);
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
}
int main()
{
try
{
AppResources app;
initApp(app);
renderdoc::initialize();
renderdoc::startCapture();
run_A2_task1(app);
run_A2_task2(app);
renderdoc::endCapture();
app.destroy();
}
catch (vk::SystemError &err)
{
std::cout << "vk::SystemError: " << err.what() << std::endl;
exit(-1);
}
catch (std::exception &err)
{
std::cout << "std::exception: " << err.what() << std::endl;
exit(-1);
}
catch (...)
{
std::cout << "unknown error\n";
exit(-1);
}
return EXIT_SUCCESS;
}

50
src/renderdoc.cpp Normal file
View File

@@ -0,0 +1,50 @@
#include "renderdoc.h"
#include <cassert>
#ifdef ENABLE_RENDERDOC
#include "renderdoc_app.h"
#ifdef _WIN32
#include <windows.h>
#elif __linux__
#include <dlfcn.h>
#endif
static RENDERDOC_API_1_1_2 *rdoc_api = nullptr;
#endif
namespace renderdoc {
void initialize() {
#ifdef ENABLE_RENDERDOC
pRENDERDOC_GetAPI RENDERDOC_GetAPI = nullptr;
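// Only resolve the API if the RenderDoc library is already loaded into this process
// (i.e. the app was launched through RenderDoc); neither call below loads it itself.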
#ifdef _WIN32
if(HMODULE mod = GetModuleHandleA("renderdoc.dll"))
RENDERDOC_GetAPI = (pRENDERDOC_GetAPI)GetProcAddress(mod, "RENDERDOC_GetAPI");
#elif __linux__
if(void *mod = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD))
RENDERDOC_GetAPI = (pRENDERDOC_GetAPI)dlsym(mod, "RENDERDOC_GetAPI");
#endif
if (RENDERDOC_GetAPI != nullptr) {
int ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_1_2, (void **)&rdoc_api);
assert(ret == 1);
}
#endif
}
void startCapture() {
#ifdef ENABLE_RENDERDOC
if (rdoc_api)
rdoc_api->StartFrameCapture(nullptr, nullptr);
#endif
}
void endCapture() {
#ifdef ENABLE_RENDERDOC
if (rdoc_api)
rdoc_api->EndFrameCapture(nullptr, nullptr);
#endif
}
}

116
src/task_common.cpp Normal file
View File

@@ -0,0 +1,116 @@
#include <iostream>
#include <cstdlib>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include <fstream>
#include <vector>
#include "task_common.h"
#include "initialization.h"
#include "utils.h"
namespace Cmn {
//Turn a vector of bindings into a DescriptorSetLayout
void createDescriptorSetLayout(vk::Device& device,
std::vector<vk::DescriptorSetLayoutBinding>& bindings,
vk::DescriptorSetLayout& descLayout) {
vk::DescriptorSetLayoutCreateInfo layoutInfo(
{},
CAST(bindings), // Number of binding infos
bindings.data() // Array of binding infos
);
descLayout = device.createDescriptorSetLayout(layoutInfo);
}
void addStorage(std::vector<vk::DescriptorSetLayoutBinding>& bindings, uint32_t binding) {
//Bindings needed for DescriptorSetLayout
//The DescriptorType eStorageBuffer is used here as a storage buffer for the compute shader
//The 'binding' argument is the binding number referenced in the shader
//DescriptorCount is set to 1U
bindings.push_back(vk::DescriptorSetLayoutBinding(
binding, // The binding number of this entry
vk::DescriptorType::eStorageBuffer, // Type of resource descriptors used for this binding
1U, // Number of descriptors contained in the binding
vk::ShaderStageFlagBits::eCompute) // Shader stages that may access the resource (compute only here)
);
}
void allocateDescriptorSet(vk::Device& device, vk::DescriptorSet& descSet, vk::DescriptorPool& descPool,
vk::DescriptorSetLayout& descLayout) {
// You can technically allocate multiple sets at once (one per layout); we only need one
vk::DescriptorSetAllocateInfo descAllocInfo(descPool, 1U, &descLayout);
// Therefore the vector is length one, we want to take its (only) element
descSet = device.allocateDescriptorSets(descAllocInfo)[0];
}
//Binding our DescriptorSet to Buffer
//VK_WHOLE_SIZE is specified to bind the entire Buffer
//The DescriptorType eStorageBuffer must match the type declared in the DescriptorSetLayout
//Several WriteDescriptorSets can be gathered into an array and applied with a single updateDescriptorSets call
void bindBuffers(vk::Device& device, vk::Buffer& b, vk::DescriptorSet& set, uint32_t binding) {
// Buffer info and data offset info
vk::DescriptorBufferInfo descInfo(
b, // Buffer to get data from
0ULL, // Position of start of data
VK_WHOLE_SIZE // Size of data
);
// dstSet, dstBinding (the binding index used in the shader), dstArrayElement, descriptorCount, type, imageInfo, bufferInfo
vk::WriteDescriptorSet write(set, binding, 0U, 1U,
vk::DescriptorType::eStorageBuffer, nullptr, &descInfo);
device.updateDescriptorSets(1U, &write, 0U, nullptr);
}
void createPipeline(vk::Device& device, vk::Pipeline& pipeline,
vk::PipelineLayout& pipLayout, vk::SpecializationInfo& specInfo,
vk::ShaderModule& sModule) {
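// The SpecializationInfo supplies specialization constants (e.g. a workgroup size)
// that are baked in when the compute pipeline is compiled.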
vk::PipelineShaderStageCreateInfo stageInfo(vk::PipelineShaderStageCreateFlags(),
vk::ShaderStageFlagBits::eCompute, sModule,
"main", &specInfo);
vk::ComputePipelineCreateInfo computeInfo(vk::PipelineCreateFlags(), stageInfo, pipLayout);
// This is a workaround: ideally there should not be a ".value"
// This should be fixed in later releases of the SDK
pipeline = device.createComputePipeline(nullptr, computeInfo, nullptr).value;
}
//Creates a pool large enough for numDescriptorSets descriptor sets, each using all of the given bindings
void createDescriptorPool(vk::Device& device,
std::vector<vk::DescriptorSetLayoutBinding>& bindings, vk::DescriptorPool& descPool,
uint32_t numDescriptorSets) {
vk::DescriptorPoolSize descriptorPoolSize = vk::DescriptorPoolSize(
vk::DescriptorType::eStorageBuffer, bindings.size() * numDescriptorSets);
vk::DescriptorPoolCreateInfo descriptorPoolCI = vk::DescriptorPoolCreateInfo(
vk::DescriptorPoolCreateFlags(), numDescriptorSets, 1U, &descriptorPoolSize);
descPool = device.createDescriptorPool(descriptorPoolCI);
}
void createShader(vk::Device& device, vk::ShaderModule& shaderModule, const std::string& filename) {
std::vector<char> cshader = readFile(filename);
// Shader Module creation information
vk::ShaderModuleCreateInfo smi(
{},
static_cast<uint32_t>(cshader.size()), // Size of code
reinterpret_cast<const uint32_t *>(cshader.data())); // Pointer to code (of uint32_t pointer type)
shaderModule = device.createShaderModule(smi);
}
}
void TaskResources::destroy(vk::Device& device) {
//Destroy all the resources we created in reverse order
//Pipeline Should be destroyed before PipelineLayout
device.destroyPipeline(this->pipeline);
//PipelineLayout should be destroyed before DescriptorPool
device.destroyPipelineLayout(this->pipelineLayout);
//DescriptorPool should be destroyed before the DescriptorSetLayout
device.destroyDescriptorPool(this->descriptorPool);
device.destroyDescriptorSetLayout(this->descriptorSetLayout);
device.destroyShaderModule(this->cShader);
//The DescriptorSet does not need to be destroyed explicitly; it is freed together with its DescriptorPool.
std::cout << std::endl
<< "destroyed everything successfully in task" << std::endl;
}

109
src/utils.cpp Normal file
View File

@@ -0,0 +1,109 @@
#include <vector>
#include <iostream>
#include <fstream>
#include <cstring>
#include <sstream>
#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1
#include <vulkan/vulkan.hpp>
#include "utils.h"
std::vector<char> readFile(const std::string& filename) {
std::ifstream file(filename, std::ios::ate | std::ios::binary);
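// std::ios::ate opens the file positioned at the end, so tellg() below yields the file size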
if (!file.is_open()) {
std::string error = "failed to open file: " + filename;
throw std::runtime_error(error);
}
size_t fileSize = (size_t) file.tellg();
std::vector<char> buffer(fileSize);
file.seekg(0);
file.read(buffer.data(), fileSize);
file.close();
// uncomment for debug
//std::cout << "read " << buffer.size() << " bytes of data in file " << filename << std::endl;
return buffer;
}
std::string formatSize(uint64_t size) {
std::ostringstream oss;
if (size < 1024) {
oss << size << " B";
} else if (size < 1024 * 1024) {
oss << size / 1024.f << " KB";
} else if (size < 1024 * 1024 * 1024) {
oss << size / (1024.0f * 1024.0f) << " MB";
} else {
oss << size / (1024.0f * 1024.0f * 1024.0f) << " GB";
}
return oss.str();
}
uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties, vk::PhysicalDevice& pdevice) {
vk::PhysicalDeviceMemoryProperties memProperties = pdevice.getMemoryProperties();
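// Bit i of typeFilter (from vk::MemoryRequirements) marks memory type i as usable;
// additionally require that the type has all requested property flags.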
for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) {
if ((typeFilter & (1 << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) {
return i;
}
}
throw std::runtime_error("failed to find suitable memory type!");
}
void createBuffer(vk::PhysicalDevice& pDevice, vk::Device& device,
const vk::DeviceSize& size, vk::BufferUsageFlags usage,
vk::MemoryPropertyFlags properties, std::string name, vk::Buffer& buffer,
vk::DeviceMemory& bufferMemory) {
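// No queue family indices are passed, so the buffer defaults to exclusive sharing mode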
vk::BufferCreateInfo inBufferInfo({}, size, usage);
buffer = device.createBuffer(inBufferInfo);
setObjectName(device, buffer, name);
vk::MemoryRequirements memReq = device.getBufferMemoryRequirements(buffer);
vk::MemoryAllocateInfo allocInfo(memReq.size,
findMemoryType(memReq.memoryTypeBits, properties, pDevice));
bufferMemory = device.allocateMemory(allocInfo);
device.bindBufferMemory(buffer, bufferMemory, 0U);
}
void createBuffer(vk::PhysicalDevice& pDevice, vk::Device& device,
const vk::DeviceSize& size, vk::BufferUsageFlags usage,
vk::MemoryPropertyFlags properties, std::string name, Buffer& buffer) {
createBuffer(pDevice, device, size, usage, properties, name, buffer.buf, buffer.mem);
}
void destroyBuffer(vk::Device& device, Buffer& buffer) {
device.destroyBuffer(buffer.buf);
device.freeMemory(buffer.mem);
}
void copyBuffer(vk::Device& device, vk::Queue& q, vk::CommandPool& commandPool,
const vk::Buffer& srcBuffer, vk::Buffer& dstBuffer, vk::DeviceSize byteSize) {
vk::CommandBuffer commandBuffer = beginSingleTimeCommands(device, commandPool);
vk::BufferCopy copyRegion(0ULL, 0ULL, byteSize);
commandBuffer.copyBuffer(srcBuffer, dstBuffer, 1, &copyRegion);
endSingleTimeCommands(device, q, commandPool, commandBuffer);
}
vk::CommandBuffer beginSingleTimeCommands(vk::Device& device, vk::CommandPool& commandPool) {
vk::CommandBufferAllocateInfo allocInfo(commandPool, vk::CommandBufferLevel::ePrimary, 1);
vk::CommandBuffer commandBuffer = device.allocateCommandBuffers(allocInfo)[0];
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
commandBuffer.begin(beginInfo);
return commandBuffer;
}
void endSingleTimeCommands(vk::Device& device, vk::Queue& q,
vk::CommandPool& commandPool, vk::CommandBuffer& commandBuffer) {
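// Submit and block with waitIdle: simple and fully synchronous, fine for one-off transfers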
commandBuffer.end();
vk::SubmitInfo submitInfo(0U, nullptr, nullptr, 1U, &commandBuffer);
q.submit({submitInfo}, nullptr);
q.waitIdle();
device.freeCommandBuffers(commandPool, 1, &commandBuffer);
}