cmake_minimum_required(VERSION 3.16)
set(CMAKE_CXX_STANDARD 17)

project(Assignment2)

# add_shader(<target> <shader>)
#
# Compiles the GLSL file <shader> (path relative to this directory) to SPIR-V with
# glslc and attaches the generated file to <target>, so building the target also
# rebuilds the shader. The output is written to <source dir>/build/<shader>.spv,
# which is where the C++ code loads it from (workingDir + "build/shaders/...").
function(add_shader TARGET SHADER)
    find_program(GLSLC glslc)
    # Fail at configure time with a readable message instead of letting the build
    # try to run "GLSLC-NOTFOUND".
    if (NOT GLSLC)
        message(FATAL_ERROR "glslc not found: install the Vulkan SDK or add glslc to PATH")
    endif ()

    set(current-shader-path ${CMAKE_CURRENT_SOURCE_DIR}/${SHADER})
    set(current-output-path ${CMAKE_CURRENT_SOURCE_DIR}/build/${SHADER}.spv)

    file(MAKE_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/build)
    get_filename_component(current-output-dir ${current-output-path} DIRECTORY)
    file(MAKE_DIRECTORY ${current-output-dir})

    add_custom_command(
        OUTPUT ${current-output-path}
        COMMAND ${GLSLC} --target-env=vulkan1.2 -o ${current-output-path} ${current-shader-path}
        DEPENDS ${current-shader-path}
        # Reuses the C++ include scanner so files #include'd by the shader are tracked
        # (only honoured by Makefile generators).
        IMPLICIT_DEPENDS CXX ${current-shader-path}
        VERBATIM)

    # Make sure our build depends on this output.
    set_source_files_properties(${current-output-path} PROPERTIES GENERATED TRUE)
    target_sources(${TARGET} PRIVATE ${current-output-path})
endfunction(add_shader)

find_package(Vulkan REQUIRED)

# Locate RenderDoc: an explicit RENDERDOC_PATH environment variable wins, otherwise
# fall back to the default Windows install location.
if (DEFINED ENV{RENDERDOC_PATH})
    # "$ENV{...}" reads the variable; without the "$" the literal text
    # "ENV{RENDERDOC_PATH}" would be stored and used as the include directory.
    set(RENDERDOC_PATH "$ENV{RENDERDOC_PATH}")
elseif (WIN32)
    if(EXISTS "C:\\Program Files\\RenderDoc")
        set(RENDERDOC_PATH "C:\\Program Files\\RenderDoc")
    endif()
else ()
    #LINUX PATH HERE
endif ()

set(SOURCE_FILE
    src/main.cpp
    src/task_common.cpp
    src/host_timer.cpp
    src/initialization.cpp
    src/renderdoc.cpp
    src/utils.cpp
    src/A2Task1.cpp
    src/A2Task2.cpp
    src/A2Task1Solution/Interleaved.cpp
    src/A2Task1Solution/KernelDecomposition.cpp
    src/A2Task1Solution/Sequential.cpp
    src/A2Task2Solution/KernelDecomposition.cpp
    src/A2Task2Solution/Naive.cpp
)

# Absolute source directory; include/helper.h turns it into `workingDir`.
add_compile_definitions(WORKING_DIR="${CMAKE_CURRENT_SOURCE_DIR}")

add_executable(${PROJECT_NAME} ${SOURCE_FILE})

add_shader(${PROJECT_NAME} shaders/A2Task1Interleaved.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1KernelDecomposition.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1KernelDecompositionAtomic.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1KernelDecompositionUnroll.comp)
add_shader(${PROJECT_NAME} shaders/A2Task1Sequential.comp)
add_shader(${PROJECT_NAME} shaders/A2Task2KernelDecomposition.comp)
add_shader(${PROJECT_NAME} shaders/A2Task2KernelDecompositionOffset.comp)
add_shader(${PROJECT_NAME} shaders/A2Task2Naive.comp)

target_include_directories(${PROJECT_NAME} PUBLIC ./include)
target_include_directories(${PROJECT_NAME} PRIVATE ${Vulkan_INCLUDE_DIRS})
target_link_libraries(${PROJECT_NAME} Vulkan::Vulkan)
target_compile_definitions(${PROJECT_NAME} PRIVATE)
if (RENDERDOC_PATH)
    target_include_directories(${PROJECT_NAME} PRIVATE /usr/local/renderdoc_1.33/include)
    target_include_directories(${PROJECT_NAME} PRIVATE ${RENDERDOC_PATH})
    target_compile_definitions(${PROJECT_NAME} PRIVATE ENABLE_RENDERDOC)
endif ()
end of file diff --git a/include/A2Task1.h b/include/A2Task1.h new file mode 100644 index 0000000..b98ff1d --- /dev/null +++ b/include/A2Task1.h @@ -0,0 +1,37 @@ +#pragma once +#include "helper.h" + +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 + +#include +#include +#include +#include "initialization.h" +#include "utils.h" +#include "task_common.h" + +class A2Task1Solution { +public: + float mstime; + + virtual void prepare(const std::vector &input) = 0; + virtual void compute() = 0; + virtual uint result() const = 0; + virtual void cleanup() = 0; +}; + +class A2Task1 { +public: + A2Task1(uint problemSize); + A2Task1(std::vector input); + + bool evaluateSolution(A2Task1Solution& solution); + +private: + void computeReference(); + + std::vector input; + uint reference; +}; \ No newline at end of file diff --git a/include/A2Task2.h b/include/A2Task2.h new file mode 100644 index 0000000..b12f331 --- /dev/null +++ b/include/A2Task2.h @@ -0,0 +1,40 @@ +#pragma once +#include "helper.h" + +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 + +#include +#include +#include +#include "initialization.h" +#include "utils.h" +#include "task_common.h" + +class A2Task2Solution { +public: + float mstime; + + virtual void prepare(const std::vector &input) = 0; + virtual void compute() = 0; + virtual std::vector result() const = 0; + virtual void cleanup() = 0; +}; + +class A2Task2 { +public: + A2Task2(uint problemSize); + A2Task2(std::vector input); + + bool evaluateSolution(A2Task2Solution& solution); + size_t size() const { + return input.size(); + } + +private: + void computeReference(); + + std::vector input; + std::vector reference; +}; \ No newline at end of file diff --git a/include/helper.h b/include/helper.h new file mode 100644 index 0000000..237b171 --- /dev/null +++ b/include/helper.h @@ -0,0 +1,8 @@ +#pragma once +#include + +#if defined(WORKING_DIR) +inline std::string workingDir = std::string(WORKING_DIR) + "/"; +#else +inline 
/// Host-side stopwatch built on std::chrono::high_resolution_clock.
///
/// Only the declaration lives here; the member functions are defined in
/// src/host_timer.cpp. The solutions construct a timer right before submitting
/// work and multiply elapsed() by 1000 to fill `mstime`, which suggests that
/// elapsed() reports seconds -- confirm against the definition.
class HostTimer {
public:
    HostTimer();

    /// Restarts the measurement.
    void reset();

    /// Time elapsed since the timer was started.
    double elapsed() const;

private:
    using clock = std::chrono::high_resolution_clock;

    // Reference time point for elapsed().
    clock::time_point start;
};
// Entry points of the project's RenderDoc integration. Only the declarations are
// visible here; the definitions live in src/renderdoc.cpp, and the build defines
// ENABLE_RENDERDOC only when a RenderDoc installation path was found.
namespace renderdoc {
    // NOTE(review): presumably sets up the RenderDoc in-application API once at
    // startup -- confirm in src/renderdoc.cpp.
    void initialize();
    // NOTE(review): presumably marks the beginning of a frame capture -- confirm.
    void startCapture();
    // NOTE(review): presumably ends the capture started by startCapture() -- confirm.
    void endCapture();
}
/// Integer division rounded towards positive infinity.
///
/// @param x dividend
/// @param y divisor; must not be zero
/// @return the smallest integer >= x / y, converted to T
///
/// Works for unsigned operands and for signed operands of either sign. Mixing a
/// negative signed value with an unsigned operand is not supported, because the
/// usual arithmetic conversions make the division itself unsigned.
template <typename T, typename V>
T ceilDiv(T x, V y) {
    const auto quotient = x / y;
    const bool hasRemainder = (x % y) != 0;
    // C++ division truncates towards zero. That is already the ceiling when the
    // exact quotient is negative, so only round up when both signs agree.
    const bool sameSign = (x < 0) == (y < 0);
    return static_cast<T>(quotient + ((hasRemainder && sameSign) ? 1 : 0));
}
+ +template +void fillDeviceBuffer(vk::Device &device, vk::DeviceMemory &mem, const std::vector &input) +{ + void *data = device.mapMemory(mem, 0, input.size() * sizeof(T), vk::MemoryMapFlags()); + memcpy(data, input.data(), static_cast(input.size() * sizeof(T))); + device.unmapMemory(mem); +} + +template +void fillHostBuffer(vk::Device &device, vk::DeviceMemory &mem, std::vector &output) +{ + // copy memory from mem to output + void *data = device.mapMemory(mem, 0, output.size() * sizeof(T), vk::MemoryMapFlags()); + memcpy(output.data(), data, static_cast(output.size() * sizeof(T))); + device.unmapMemory(mem); +} + +template +void fillDeviceWithStagingBuffer(vk::PhysicalDevice &pDevice, vk::Device &device, + vk::CommandPool &commandPool, vk::Queue &q, + Buffer &b, const std::vector &data) +{ + // Buffer b requires the eTransferSrc bit + // data (host) -> staging (device) -> Buffer b (device) + vk::Buffer staging; + vk::DeviceMemory mem; + vk::DeviceSize byteSize = data.size() * sizeof(T); + + createBuffer(pDevice, device, byteSize, vk::BufferUsageFlagBits::eTransferSrc, + vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostVisible, "staging", + staging, mem); + // V host -> staging V + fillDeviceBuffer(device, mem, data); + // V staging -> buffer V + copyBuffer(device, q, commandPool, staging, b.buf, byteSize); + device.destroyBuffer(staging); + device.freeMemory(mem); +} + +template +void fillHostWithStagingBuffer(vk::PhysicalDevice &pDevice, vk::Device &device, + vk::CommandPool &commandPool, vk::Queue &q, + const Buffer &b, std::vector &data) +{ + // Buffer b requires the eTransferDst bit + // Buffer b (device) -> staging (device) -> data (host) + vk::Buffer staging; + vk::DeviceMemory mem; + vk::DeviceSize byteSize = data.size() * sizeof(T); + + createBuffer(pDevice, device, byteSize, vk::BufferUsageFlagBits::eTransferDst, + vk::MemoryPropertyFlagBits::eHostCoherent | vk::MemoryPropertyFlagBits::eHostVisible, "staging", + staging, 
mem); + // V buffer -> staging V + copyBuffer(device, q, commandPool, b.buf, staging, byteSize); + // V staging -> host V + fillHostBuffer(device, mem, data); + + device.destroyBuffer(staging); + device.freeMemory(mem); +} + +template +void setObjectName(vk::Device &device, T handle, std::string name) +{ +#ifndef NDEBUG + vk::DebugUtilsObjectNameInfoEXT infoEXT(handle.objectType, uint64_t(static_cast(handle)), name.c_str()); + device.setDebugUtilsObjectNameEXT(infoEXT); +#endif +} + +#endif diff --git a/shaders/A2Task1Interleaved.comp b/shaders/A2Task1Interleaved.comp new file mode 100644 index 0000000..440da96 --- /dev/null +++ b/shaders/A2Task1Interleaved.comp @@ -0,0 +1,21 @@ +#version 450 + +/* built in: +in uvec3 gl_NumWorkGroups; +in uvec3 gl_WorkGroupID; +in uvec3 gl_LocalInvocationID; +in uvec3 gl_GlobalInvocationID; +in uint gl_LocalInvocationIndex; +*/ +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform PushStruct { + uint size; + uint stride; +} p; + +layout(binding = 0) buffer inoutBufer {uint v[];}; + +void main() { + // TODO: Kernel implementation +} diff --git a/shaders/A2Task1KernelDecomposition.comp b/shaders/A2Task1KernelDecomposition.comp new file mode 100644 index 0000000..8967b69 --- /dev/null +++ b/shaders/A2Task1KernelDecomposition.comp @@ -0,0 +1,24 @@ +#version 450 + +/* built in: +in uvec3 gl_NumWorkGroups; +in uvec3 gl_WorkGroupID; +in uvec3 gl_LocalInvocationID; +in uvec3 gl_GlobalInvocationID; +in uint gl_LocalInvocationIndex; +*/ +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform PushStruct { + uint size; + uint offset; +} p; + +layout(binding = 0) buffer inBuffer { uint v[]; }; +layout(binding = 1) buffer outBuffer { uint g_v[]; }; + +// TODO: Shared variables + +void main() { + // TODO: Kernel implementation +} \ No newline at end of file diff --git a/shaders/A2Task1KernelDecompositionAtomic.comp 
#version 450

/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/
// Work-group size along x is taken from specialization constant 0; y and z are fixed to 1.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Per-dispatch parameters. This block must stay in sync with the push-constant
// struct the host pushes for this pipeline (same members, same order).
layout(push_constant) uniform PushStruct {
    uint size;
    uint offset;
} p;

// Storage buffers at bindings 0 and 1 of the bound descriptor set.
layout(binding = 0) buffer inBuffer { uint v[]; };
layout(binding = 1) buffer outBuffer { uint g_v[]; };

// TODO: Shared variables

// Entry point; the kernel body has not been written yet.
void main() {
    // TODO: Kernel implementation
}
#version 450

/* built in:
in uvec3 gl_NumWorkGroups;
in uvec3 gl_WorkGroupID;
in uvec3 gl_LocalInvocationID;
in uvec3 gl_GlobalInvocationID;
in uint gl_LocalInvocationIndex;
*/

// Why did we not have conflicts in the Reduction?
// Because of the sequential addressing (here we use interleaved => we have conflicts).
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
#define NUM_BANKS 32
#define NUM_BANKS_LOG 5
#define SIMD_GROUP_SIZE 32

// Work-group size along x is taken from specialization constant 0; y and z are fixed to 1.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Per-dispatch parameters pushed by the host.
layout(push_constant) uniform PushStruct {
    uint size;
} p;

// Storage buffers at bindings 0 and 1 of the bound descriptor set.
layout(binding = 0) buffer inoutBufer {uint array[];};
layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};

// TODO: Shared variables

// Bank conflicts
// NOTE: while AVOID_BANK_CONFLICTS is defined, OFFSET(A) stays undefined until the
// TODO below is filled in; only the #else branch currently provides a definition.
#define AVOID_BANK_CONFLICTS
#ifdef AVOID_BANK_CONFLICTS
// TODO: define your conflict-free macro here
#else
#define OFFSET(A) (A)
#endif

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Entry point; the comments below outline the intended steps, none is implemented yet.
void main()
{
    // TODO: Kernel implementation

    // Cache first half of elements in the local memory
    // Cache second half of elements

    // Perform up-sweep

    // Unroll the last steps when arrived at warp size
    // Set the last element to 0


    // Perform down-sweep
}
+in uvec3 gl_NumWorkGroups; +in uvec3 gl_WorkGroupID; +in uvec3 gl_LocalInvocationID; +in uvec3 gl_GlobalInvocationID; +in uint gl_LocalInvocationIndex; +*/ +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; +layout (constant_id = 1) const uint SAMPLE_MULTIPLIER = 1; + +// Push constant +layout(push_constant) uniform PushStruct { + uint size; +} p; + +layout(binding = 0) buffer inoutBufer { uint v[]; }; +layout(binding = 1) buffer offsetBufer { uint g_v[]; }; + +// TODO: Shared variables + +void main() { + // TODO: Shared variables +} \ No newline at end of file diff --git a/shaders/A2Task2Naive.comp b/shaders/A2Task2Naive.comp new file mode 100644 index 0000000..e60a643 --- /dev/null +++ b/shaders/A2Task2Naive.comp @@ -0,0 +1,23 @@ +#version 450 + +/* built in: +in uvec3 gl_NumWorkGroups; +in uvec3 gl_WorkGroupID; +in uvec3 gl_LocalInvocationID; +in uvec3 gl_GlobalInvocationID; +in uint gl_LocalInvocationIndex; +*/ + +layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in; + +layout(push_constant) uniform PushStruct { + uint size; + uint offset; +} p; + +layout(binding = 0) buffer inBuffer { uint v[]; }; +layout(binding = 1) buffer outBufer { uint g_v[]; }; + +void main() { + // TODO: Kernel implementation +} diff --git a/src/A2Task1.cpp b/src/A2Task1.cpp new file mode 100644 index 0000000..08a3932 --- /dev/null +++ b/src/A2Task1.cpp @@ -0,0 +1,41 @@ +#include "A2Task1.h" + +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 + +#include +#include +#include +#include "initialization.h" +#include "utils.h" +#include "task_common.h" +#include "host_timer.h" + + +A2Task1::A2Task1(uint problemSize) : input(problemSize, 0) { + for (auto i = 0; i < problemSize; i++) + input[i] = i % 97; + computeReference(); +} + +A2Task1::A2Task1(std::vector input) : input(input) { + computeReference(); +} + +bool A2Task1::evaluateSolution(A2Task1Solution& solution) { + solution.prepare(input); + solution.compute(); + auto result = 
solution.result(); + if (reference != result) { + std::cout << "error: expected " << reference << ", but got " << result << std::endl; + return false; + } + return true; +} + +void A2Task1::computeReference() { + reference = 0; + for (auto e : input) + reference += e; +} \ No newline at end of file diff --git a/src/A2Task1Solution/Interleaved.cpp b/src/A2Task1Solution/Interleaved.cpp new file mode 100644 index 0000000..0781d21 --- /dev/null +++ b/src/A2Task1Solution/Interleaved.cpp @@ -0,0 +1,85 @@ +#include "Interleaved.h" + +#include "host_timer.h" + +A2Task1SolutionInterleaved::A2Task1SolutionInterleaved(AppResources &app, uint workGroupSize) : + app(app), workGroupSize(workGroupSize) {} + +void A2Task1SolutionInterleaved::prepare(const std::vector &input) +{ + mpInput = &input; + + Cmn::addStorage(bindings, 0); + Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout); + vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant)); + vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr); + pipelineLayout = app.device.createPipelineLayout(pipInfo); + + // Specialization constant for workgroup size + std::array specEntries = std::array{ + {{0U, 0U, sizeof(workGroupSize)}}, + }; + std::array specValues = {workGroupSize}; //for workgroup sizes + vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), + CAST(specValues) * sizeof(int), specValues.data()); + + Cmn::createShader(app.device, shaderModule, workingDir +"build/shaders/A2Task1Interleaved.comp.spv"); + Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule); + + createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), + vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eDeviceLocal, "inoutBuffer", inoutBuffer); + + 
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, input); + + Cmn::createDescriptorPool(app.device, bindings, descriptorPool); + Cmn::allocateDescriptorSet(app.device, descriptorSet, descriptorPool, descriptorSetLayout); + Cmn::bindBuffers(app.device, inoutBuffer.buf, descriptorSet, 0); +} + +void A2Task1SolutionInterleaved::compute() +{ + vk::CommandBufferAllocateInfo allocInfo( + app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U); + vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0]; + + vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); + + cb.begin(beginInfo); + + // TODO: Implement reduction with interleaved addressing + + cb.end(); + + vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); + + HostTimer timer; + + app.computeQueue.submit({submitInfo}); + app.device.waitIdle(); + + mstime = timer.elapsed() * 1000; + + app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb); +} + +uint A2Task1SolutionInterleaved::result() const +{ + std::vector result(1, 0); + fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, result); + return result[0]; +} + +void A2Task1SolutionInterleaved::cleanup() +{ + app.device.destroyDescriptorPool(descriptorPool); + + app.device.destroyPipeline(pipeline); + app.device.destroyShaderModule(shaderModule); + + app.device.destroyPipelineLayout(pipelineLayout); + app.device.destroyDescriptorSetLayout(descriptorSetLayout); + bindings.clear(); + + destroyBuffer(app.device, inoutBuffer); +} \ No newline at end of file diff --git a/src/A2Task1Solution/Interleaved.h b/src/A2Task1Solution/Interleaved.h new file mode 100644 index 0000000..0f3bd86 --- /dev/null +++ b/src/A2Task1Solution/Interleaved.h @@ -0,0 +1,42 @@ +#pragma once + +#include "A2Task1.h" + +class A2Task1SolutionInterleaved : public A2Task1Solution{ +public: + 
// Creates all GPU state for the kernel-decomposition reduction of `input`:
// descriptor set layout, pipeline layout, compute pipeline, two device-local
// buffers and two descriptor sets that bind those buffers in opposite order.
// Only a pointer to `input` is stored, so the caller's vector must outlive this object's use of it.
void A2Task1SolutionKernelDecomposition::prepare(const std::vector &input)
{
    mpInput = &input;

    // Two storage-buffer bindings (0 and 1) plus one push-constant range for the compute stage.
    Cmn::addStorage(bindings, 0);
    Cmn::addStorage(bindings, 1);
    Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout);
    vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant));
    vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr);
    pipelineLayout = app.device.createPipelineLayout(pipInfo);

    // Specialization constant for workgroup size
    // (constant id 0 at offset 0; the shaders consume it through local_size_x_id = 0)
    std::array specEntries = std::array{
        {{0U, 0U, sizeof(workGroupSize)}},
    };
    std::array specValues = {workGroupSize}; //for workgroup sizes
    // NOTE(review): the data size uses sizeof(int) although the value is a uint;
    // the sizes are expected to be equal, but sizeof(workGroupSize) would state the intent.
    vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
                                                             CAST(specValues) * sizeof(int), specValues.data());

    // The shader file is chosen by the caller through the constructor argument.
    Cmn::createShader(app.device, shaderModule, shaderFileName);
    Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule);

    // Both buffers get the full input size and transfer-src/dst + storage usage.
    for (int i = 0; i < 2; i++) {
        createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]),
                     vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer,
                     vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i].buf, buffers[i].mem);
    }

    // Only buffers[0] receives the input data.
    fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input);

    Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2);
    for (int i = 0; i < 2; i++)
        Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout);
    // Set 0: buffers[0] at binding 0, buffers[1] at binding 1.
    Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[0], 0);
    Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[0], 1);
    // Set 1: the same buffers with the bindings swapped.
    Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[1], 0);
    Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[1], 1);
}
nullptr, nullptr, 1, &cb); + + HostTimer timer; + + app.computeQueue.submit({submitInfo}); + app.device.waitIdle(); + + mstime = timer.elapsed() * 1000; + + app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb); +} + +uint A2Task1SolutionKernelDecomposition::result() const +{ + std::vector result(1, 0); + fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result); + return result[0]; +} + +void A2Task1SolutionKernelDecomposition::cleanup() +{ + app.device.destroyDescriptorPool(descriptorPool); + + app.device.destroyPipeline(pipeline); + app.device.destroyShaderModule(shaderModule); + + app.device.destroyPipelineLayout(pipelineLayout); + app.device.destroyDescriptorSetLayout(descriptorSetLayout); + bindings.clear(); + + for (int i = 0; i < 2; i++) + destroyBuffer(app.device, buffers[i]); +} \ No newline at end of file diff --git a/src/A2Task1Solution/KernelDecomposition.h b/src/A2Task1Solution/KernelDecomposition.h new file mode 100644 index 0000000..cac390f --- /dev/null +++ b/src/A2Task1Solution/KernelDecomposition.h @@ -0,0 +1,44 @@ +#pragma once + +#include "A2Task1.h" + +class A2Task1SolutionKernelDecomposition : public A2Task1Solution{ +public: + A2Task1SolutionKernelDecomposition(AppResources &app, uint workGroupSize, std::string shaderFileName); + + void prepare(const std::vector &input) override; + void compute() override; + uint result() const override; + void cleanup() override; + +private: + struct PushConstant + { + uint size; + }; + + AppResources &app; + uint workGroupSize; + std::string shaderFileName; + + const std::vector* mpInput; + + Buffer buffers[2]; + + // Descriptor & Pipeline Layout + std::vector bindings; + vk::DescriptorSetLayout descriptorSetLayout; + vk::PipelineLayout pipelineLayout; + + // Local PPS Pipeline + vk::ShaderModule shaderModule; + vk::Pipeline pipeline; + + // Descriptor Pool + vk::DescriptorPool descriptorPool; + + // Per-dispatch data + 
vk::DescriptorSet descriptorSets[2]; + + uint activeBuffer = 0; +}; diff --git a/src/A2Task1Solution/Sequential.cpp b/src/A2Task1Solution/Sequential.cpp new file mode 100644 index 0000000..1bc404d --- /dev/null +++ b/src/A2Task1Solution/Sequential.cpp @@ -0,0 +1,90 @@ +#include "Sequential.h" + +#include "host_timer.h" + +A2Task1SolutionSequential::A2Task1SolutionSequential(AppResources &app, uint workGroupSize) : + app(app), workGroupSize(workGroupSize) {} + +void A2Task1SolutionSequential::prepare(const std::vector &input) +{ + mpInput = &input; + + Cmn::addStorage(bindings, 0); + Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout); + vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant)); + vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr); + pipelineLayout = app.device.createPipelineLayout(pipInfo); + + // Specialization constant for workgroup size + std::array specEntries = std::array{ + {{0U, 0U, sizeof(workGroupSize)}}, + }; + std::array specValues = {workGroupSize}; //for workgroup sizes + vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), + CAST(specValues) * sizeof(int), specValues.data()); + + Cmn::createShader(app.device, shaderModule, workingDir +"build/shaders/A2Task1Sequential.comp.spv"); + Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, shaderModule); + + createBuffer(app.pDevice, app.device, mpInput->size() * sizeof((*mpInput)[0]), + vk::BufferUsageFlagBits::eTransferDst | vk::BufferUsageFlagBits::eTransferSrc | vk::BufferUsageFlagBits::eStorageBuffer, + vk::MemoryPropertyFlagBits::eDeviceLocal, "inoutBuffer", inoutBuffer.buf, inoutBuffer.mem); + + fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, input); + + Cmn::createDescriptorPool(app.device, bindings, descriptorPool); + Cmn::allocateDescriptorSet(app.device, 
descriptorSet, descriptorPool, descriptorSetLayout); + Cmn::bindBuffers(app.device, inoutBuffer.buf, descriptorSet, 0); +} + +void A2Task1SolutionSequential::compute() +{ + vk::CommandBufferAllocateInfo allocInfo( + app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U); + vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0]; + + vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); + + cb.begin(beginInfo); + + // TODO: Implement reduction with sequential addressing + + cb.end(); + + vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); + + HostTimer timer; + + app.computeQueue.submit({submitInfo}); + app.device.waitIdle(); + + mstime = timer.elapsed() * 1000; + + app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb); +} + +uint A2Task1SolutionSequential::result() const +{ + std::vector result(1, 0); + fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffer, result); + return result[0]; +} + +void A2Task1SolutionSequential::cleanup() +{ + app.device.destroyDescriptorPool(descriptorPool); + + app.device.destroyPipeline(pipeline); + app.device.destroyShaderModule(shaderModule); + + app.device.destroyPipelineLayout(pipelineLayout); + app.device.destroyDescriptorSetLayout(descriptorSetLayout); + bindings.clear(); + + auto Bclean = [&](Buffer &b){ + app.device.destroyBuffer(b.buf); + app.device.freeMemory(b.mem); + }; + + Bclean(inoutBuffer); +} \ No newline at end of file diff --git a/src/A2Task1Solution/Sequential.h b/src/A2Task1Solution/Sequential.h new file mode 100644 index 0000000..3cfec12 --- /dev/null +++ b/src/A2Task1Solution/Sequential.h @@ -0,0 +1,42 @@ +#pragma once + +#include "A2Task1.h" + +class A2Task1SolutionSequential : public A2Task1Solution{ +public: + A2Task1SolutionSequential(AppResources &app, uint workGroupSize); + + void prepare(const std::vector &input) override; + void compute() override; + uint result() 
const override; + void cleanup() override; + +private: + struct PushConstant + { + uint size; + uint offset; + }; + + AppResources &app; + uint workGroupSize; + + const std::vector* mpInput; + + Buffer inoutBuffer; + + // Descriptor & Pipeline Layout + std::vector bindings; + vk::DescriptorSetLayout descriptorSetLayout; + vk::PipelineLayout pipelineLayout; + + // Local PPS Pipeline + vk::ShaderModule shaderModule; + vk::Pipeline pipeline; + + // Descriptor Pool + vk::DescriptorPool descriptorPool; + + // Per-dispatch data + vk::DescriptorSet descriptorSet; +}; diff --git a/src/A2Task2.cpp b/src/A2Task2.cpp new file mode 100644 index 0000000..ca78fa4 --- /dev/null +++ b/src/A2Task2.cpp @@ -0,0 +1,42 @@ +#include "A2Task2.h" + +A2Task2::A2Task2(uint problemSize) : input(problemSize, 0) { + for (auto i = 0; i < problemSize; i++) + input[i] = i % 97; + computeReference(); +} + +A2Task2::A2Task2(std::vector input) : input(input) { + computeReference(); +} + +void A2Task2::computeReference() { + reference.reserve(input.size()); + uint acc = 0; + for (auto i = 0; i < input.size(); i++) { + acc += input[i]; + reference.push_back(acc); + } +} + +bool A2Task2::evaluateSolution(A2Task2Solution& solution) { + solution.prepare(input); + solution.compute(); + auto result = solution.result(); + + if (result.size() != reference.size()) { + std::cout << "error: result and reference vector size don't match!"; + return false; + } + + for (uint i = 0; i < reference.size(); i++) { + if (result[i] != reference[i]) { + std::cout << "error: result and reference don't match at index " << i << "!" 
<< std::endl; + std::cout << "\tresult: " << result[i] << std::endl; + std::cout << "\treference: " << reference[i] << std::endl; + return false; + } + } + + return true; +} diff --git a/src/A2Task2Solution/KernelDecomposition.cpp b/src/A2Task2Solution/KernelDecomposition.cpp new file mode 100644 index 0000000..95b5e19 --- /dev/null +++ b/src/A2Task2Solution/KernelDecomposition.cpp @@ -0,0 +1,114 @@ +#include "KernelDecomposition.h" + +#include "host_timer.h" + +A2Task2SolutionKernelDecomposition::A2Task2SolutionKernelDecomposition(AppResources& app, uint workGroupSize): app(app), + workGroupSize(workGroupSize) { +} + +void A2Task2SolutionKernelDecomposition::prepare(const std::vector& input) { + workSize = input.size(); + + // Descriptor & Pipeline Layout + Cmn::addStorage(bindings, 0); + Cmn::addStorage(bindings, 1); + Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout); + vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct)); + vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr); + pipelineLayout = app.device.createPipelineLayout(pipInfo); + + // Specialization constant for workgroup size + std::array specEntries = std::array{ + {{0U, 0U, sizeof(workGroupSize)}}, + }; + std::array specValues = {workGroupSize}; //for workgroup sizes + vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), + CAST(specValues) * sizeof(int), specValues.data()); + + // Local PPS Pipeline + Cmn::createShader(app.device, cShaderLocalPPS, workingDir + "build/shaders/A2Task2KernelDecomposition.comp.spv"); + Cmn::createPipeline(app.device, pipelineLocalPPS, pipelineLayout, specInfo, cShaderLocalPPS); + + // Local PPS Offset Pipeline + Cmn::createShader(app.device, cShaderLocalPPSOffset, + workingDir + "build/shaders/A2Task2KernelDecompositionOffset.comp.spv"); + Cmn::createPipeline(app.device, pipelineLocalPPSOffset, pipelineLayout, specInfo, 
cShaderLocalPPSOffset); + + // ### create buffers, get their index in the task.buffers[] array ### + using BFlag = vk::BufferUsageFlagBits; + auto makeDLocalBuffer = [ this ](vk::BufferUsageFlags usage, vk::DeviceSize size, std::string name) -> Buffer { + Buffer b; + createBuffer(app.pDevice, app.device, size, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, name, b.buf, + b.mem); + return b; + }; + + inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer, + input.size() * sizeof(uint32_t), "buffer_inout_0")); + + fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0], + input); + + // TO DO create additional buffers (by pushing into inoutBuffers) and descriptors (by pushing into descriptorSets) + // You need to create an appropriately-sized DescriptorPool first +} + +void A2Task2SolutionKernelDecomposition::compute() { + vk::CommandBufferAllocateInfo allocInfo( + app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U); + vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0]; + + vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); + + cb.begin(beginInfo); + + // TODO: Implement efficient version of scan + // Make sure that the local prefix sum works before you start experimenting with large arrays + + cb.end(); + + vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); + + HostTimer timer; + + app.computeQueue.submit({submitInfo}); + app.device.waitIdle(); + + mstime = timer.elapsed() * 1000; + + app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb); +} + +std::vector A2Task2SolutionKernelDecomposition::result() const { + std::vector result(workSize, 0); + fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0], + result); + return result; +} + + +void A2Task2SolutionKernelDecomposition::cleanup() { + + 
app.device.destroyDescriptorPool(descriptorPool); + + app.device.destroyPipeline(pipelineLocalPPSOffset); + app.device.destroyShaderModule(cShaderLocalPPSOffset); + + app.device.destroyPipeline(pipelineLocalPPS); + app.device.destroyShaderModule(cShaderLocalPPS); + + app.device.destroyPipelineLayout(pipelineLayout); + app.device.destroyDescriptorSetLayout(descriptorSetLayout); + bindings.clear(); + + auto Bclean = [&](Buffer& b) { + app.device.destroyBuffer(b.buf); + app.device.freeMemory(b.mem); + }; + + for (auto inoutBuffer: inoutBuffers) { + Bclean(inoutBuffer); + } + + inoutBuffers.clear(); +} diff --git a/src/A2Task2Solution/KernelDecomposition.h b/src/A2Task2Solution/KernelDecomposition.h new file mode 100644 index 0000000..a31694c --- /dev/null +++ b/src/A2Task2Solution/KernelDecomposition.h @@ -0,0 +1,55 @@ +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 + +#include +#include +#include +#include "initialization.h" +#include "utils.h" +#include "task_common.h" + +#include "A2Task2.h" + +struct A2Task2SolutionKernelDecomposition : A2Task2Solution { +public: + A2Task2SolutionKernelDecomposition(AppResources &app, uint workGroupSize); + + void prepare(const std::vector &input) override; + void compute() override; + std::vector result() const override; + void cleanup() override; + +private: + struct PushStruct + { + uint32_t size; + }; + + AppResources &app; + uint workGroupSize; + std::string localPPSShaderFileName; + + uint workSize; + + std::vector inoutBuffers; + + // Descriptor & Pipeline Layout + std::vector bindings; + vk::DescriptorSetLayout descriptorSetLayout; + vk::PipelineLayout pipelineLayout; + + // Local PPS Pipeline + vk::ShaderModule cShaderLocalPPS; + vk::Pipeline pipelineLocalPPS; + + // Local PPS Offset Pipeline + vk::ShaderModule cShaderLocalPPSOffset; + vk::Pipeline pipelineLocalPPSOffset; + + // Descriptor Pool + vk::DescriptorPool descriptorPool; + + // TODO extend with any additional members you may need +}; + \ No 
newline at end of file diff --git a/src/A2Task2Solution/Naive.cpp b/src/A2Task2Solution/Naive.cpp new file mode 100644 index 0000000..96e8243 --- /dev/null +++ b/src/A2Task2Solution/Naive.cpp @@ -0,0 +1,100 @@ +#include "Naive.h" + +#include "host_timer.h" + +A2Task2SolutioNaive::A2Task2SolutioNaive( + AppResources &app, uint workGroupSize): + app(app), workGroupSize(workGroupSize) {} + +void A2Task2SolutioNaive::prepare(const std::vector &input) { + workSize = input.size(); + + // Descriptor & Pipeline Layout + Cmn::addStorage(bindings, 0); + Cmn::addStorage(bindings, 1); + Cmn::createDescriptorSetLayout(app.device, bindings, descriptorSetLayout); + vk::PushConstantRange pcr(vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct)); + vk::PipelineLayoutCreateInfo pipInfo(vk::PipelineLayoutCreateFlags(), 1U, &descriptorSetLayout, 1U, &pcr); + pipelineLayout = app.device.createPipelineLayout(pipInfo); + + // Specialization constant for workgroup size + std::array specEntries = std::array{ + {{0U, 0U, sizeof(workGroupSize)}}, + }; + std::array specValues = {workGroupSize}; //for workgroup sizes + vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), + CAST(specValues) * sizeof(int), specValues.data()); + + // Local PPS Offset Pipeline + Cmn::createShader(app.device, cShader, workingDir +"build/shaders/A2Task2Naive.comp.spv"); + Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, cShader); + + // ### create buffers, get their index in the task.buffers[] array ### + using BFlag = vk::BufferUsageFlagBits; + for (int i = 0; i < 2; i++) + createBuffer(app.pDevice, app.device, input.size() * sizeof(uint32_t), BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer, vk::MemoryPropertyFlagBits::eDeviceLocal, "buffer_" + std::to_string(i), buffers[i]); + + fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[0], input); + + 
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, 2); + + for (uint i = 0; i < 2; i++) + Cmn::allocateDescriptorSet(app.device, descriptorSets[i], descriptorPool, descriptorSetLayout); + Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[0], 0); + Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[0], 1); + Cmn::bindBuffers(app.device, buffers[1].buf, descriptorSets[1], 0); + Cmn::bindBuffers(app.device, buffers[0].buf, descriptorSets[1], 1); + + activeBuffer = 0; +} + +void A2Task2SolutioNaive::compute() { + vk::CommandBufferAllocateInfo allocInfo( + app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U); + vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0]; + + vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); + + cb.begin(beginInfo); + + cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline); + + // TODO: Implement naive scan + // NOTE: make sure that activeBuffer points to the buffer with the final result in the end + // That buffer is read back for the correctness check + // (A2Task2SolutionNaive::result()) + // HINT: You can alternate between the two provided descriptor sets to implement ping-pong + + cb.end(); + + vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb); + + HostTimer timer; + + app.computeQueue.submit({submitInfo}); + app.device.waitIdle(); + + mstime = timer.elapsed() * 1000; + + app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb); +} + +std::vector A2Task2SolutioNaive::result() const { + std::vector result(workSize, 0); + fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result); + return result; +} + +void A2Task2SolutioNaive::cleanup() { + app.device.destroyDescriptorPool(descriptorPool); + + app.device.destroyPipeline(pipeline); + app.device.destroyShaderModule(cShader); + + app.device.destroyPipelineLayout(pipelineLayout); + 
app.device.destroyDescriptorSetLayout(descriptorSetLayout); + bindings.clear(); + + for (auto buffer : buffers) + destroyBuffer(app.device, buffer); +} \ No newline at end of file diff --git a/src/A2Task2Solution/Naive.h b/src/A2Task2Solution/Naive.h new file mode 100644 index 0000000..5097a77 --- /dev/null +++ b/src/A2Task2Solution/Naive.h @@ -0,0 +1,53 @@ +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 + +#include +#include +#include +#include "initialization.h" +#include "utils.h" +#include "task_common.h" + +#include "A2Task2.h" + +struct A2Task2SolutioNaive : A2Task2Solution { +public: + A2Task2SolutioNaive(AppResources &app, uint workGroupSize); + + void prepare(const std::vector &input) override; + void compute() override; + std::vector result() const override; + void cleanup() override; + +private: + struct PushStruct + { + uint size; + uint offset; + }; + + AppResources &app; + uint workGroupSize; + + uint workSize; + + Buffer buffers[2]; + + // Descriptor & Pipeline Layout + std::vector bindings; + vk::DescriptorSetLayout descriptorSetLayout; + vk::PipelineLayout pipelineLayout; + + vk::ShaderModule cShader; + vk::Pipeline pipeline; + + // Descriptor Pool + vk::DescriptorPool descriptorPool; + + // Descriptors + vk::DescriptorSet descriptorSets[2]; + + uint activeBuffer = 0; +}; + \ No newline at end of file diff --git a/src/host_timer.cpp b/src/host_timer.cpp new file mode 100644 index 0000000..fa3c2a5 --- /dev/null +++ b/src/host_timer.cpp @@ -0,0 +1,15 @@ +#include "host_timer.h" + +HostTimer::HostTimer() { + reset(); +} + +void HostTimer::reset() { + start = clock::now(); +} + +double HostTimer::elapsed() const { + auto end = clock::now(); + std::chrono::duration duration = end - start; + return duration.count(); +} \ No newline at end of file diff --git a/src/initialization.cpp b/src/initialization.cpp new file mode 100644 index 0000000..c5d161e --- /dev/null +++ b/src/initialization.cpp @@ -0,0 +1,518 @@ +#include +#include 
+#include +#include +#include + +#define VK_ENABLE_BETA_EXTENSIONS +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 +#include +VULKAN_HPP_DEFAULT_DISPATCH_LOADER_DYNAMIC_STORAGE + +#include "initialization.h" +#include "utils.h" + +#include + +// Here you create the instance and physical / logical device and maybe compute/transfer queues +// Also check if device is suitable etc + +struct DeviceSelectionCache { + uint32_t vendorID; + uint32_t deviceID; +}; + +#ifdef NDEBUG +const bool enableValidationLayers = false; +#else +const bool enableValidationLayers = true; +#endif + +const std::vector validationLayers = { +#ifndef NDEBUG + "VK_LAYER_KHRONOS_validation" +#endif +}; +const std::vector instanceExtensions = { +#ifndef NDEBUG + VK_EXT_DEBUG_UTILS_EXTENSION_NAME, +#endif +}; + +const std::vector extensionNames = { + #ifndef NDEBUG + + #endif +}; + +void AppResources::destroy() +{ + this->device.destroyQueryPool(this->queryPool); + //this->device.freeCommandBuffers(this->computeCommandPool, 1U, &this->computeCommandBuffer); + //this->device.freeCommandBuffers(this->transferCommandPool, 1U, &this->transferCommandBuffer); + this->device.destroyCommandPool(this->computeCommandPool); + //this->device.destroyCommandPool(this->transferCommandPool); + + this->device.destroy(); + +#ifndef NDEBUG + this->instance.destroyDebugUtilsMessengerEXT(this->dbgUtilsMgr); +#endif + this->instance.destroy(); +} +void initApp(AppResources& app) +{ + createInstance(app.instance, app.dbgUtilsMgr, "Assignment1, Task 1", "Idkwhattowrite"); + + selectPhysicalDevice(app.instance, app.pDevice); + auto chain = app.pDevice.getProperties2(); + app.pDeviceProperties = chain.get(); + app.pDeviceSubgroupProperties = chain.get(); + std::tie(app.cQ, app.tQ) = getComputeAndTransferQueues(app.pDevice); + createLogicalDevice(app.instance, app.pDevice, app.device); + + app.device.getQueue(app.cQ, 0U, &app.computeQueue); + app.transferQueue = app.computeQueue; + app.tQ = app.cQ; + 
//app.device.getQueue(app.tQ, 0U, &app.transferQueue); + //createCommandPool(app.device, app.transferCommandPool, app.tQ); + + createCommandPool(app.device, app.computeCommandPool, app.cQ); + app.transferCommandPool = app.computeCommandPool; + + createTimestampQueryPool(app.device, app.queryPool, 2); +} + + +//This is the function in which errors will go through to be displayed. + +VKAPI_ATTR VkBool32 VKAPI_CALL +debugUtilsMessengerCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity, + VkDebugUtilsMessageTypeFlagsEXT messageTypes, + VkDebugUtilsMessengerCallbackDataEXT const* pCallbackData, + void* /*pUserData*/) +{ + if (enableValidationLayers) + { + if (pCallbackData->messageIdNumber == 648835635) + { + // UNASSIGNED-khronos-Validation-debug-build-warning-message + return VK_FALSE; + } + if (pCallbackData->messageIdNumber == 767975156) + { + // UNASSIGNED-BestPractices-vkCreateInstance-specialuse-extension + return VK_FALSE; + } + } + + std::cerr << vk::to_string(static_cast(messageSeverity)) << ": " + << vk::to_string(static_cast(messageTypes)) << ":\n"; + std::cerr << "\t" + << "messageIDName = <" << pCallbackData->pMessageIdName << ">\n"; + std::cerr << "\t" + << "messageIdNumber = " << pCallbackData->messageIdNumber << "\n"; + std::cerr << "\t" + << "message = <" << pCallbackData->pMessage << ">\n"; + if (0 < pCallbackData->queueLabelCount) + { + std::cerr << "\t" + << "Queue Labels:\n"; + for (uint8_t i = 0; i < pCallbackData->queueLabelCount; i++) + { + std::cerr << "\t\t" + << "labelName = <" << pCallbackData->pQueueLabels[i].pLabelName << ">\n"; + } + } + if (0 < pCallbackData->cmdBufLabelCount) + { + std::cerr << "\t" + << "CommandBuffer Labels:\n"; + for (uint8_t i = 0; i < pCallbackData->cmdBufLabelCount; i++) + { + std::cerr << "\t\t" + << "labelName = <" << pCallbackData->pCmdBufLabels[i].pLabelName << ">\n"; + } + } + if (0 < pCallbackData->objectCount) + { + std::cerr << "\t" + << "Objects:\n"; + for (uint8_t i = 0; i < 
pCallbackData->objectCount; i++) + { + std::cerr << "\t\t" + << "Object " << i << "\n"; + std::cerr << "\t\t\t" + << "objectType = " + << vk::to_string(static_cast(pCallbackData->pObjects[i].objectType)) << "\n"; + std::cerr << "\t\t\t" + << "objectHandle = " << pCallbackData->pObjects[i].objectHandle << "\n"; + if (pCallbackData->pObjects[i].pObjectName) + { + std::cerr << "\t\t\t" + << "objectName = <" << pCallbackData->pObjects[i].pObjectName << ">\n"; + } + } + } + return VK_TRUE; +} + +/* + This function fills the structure with flags indicating + which error messages should go through +*/ +vk::DebugUtilsMessengerCreateInfoEXT makeDebugUtilsMessengerCreateInfoEXT() +{ + + using SEVERITY = vk::DebugUtilsMessageSeverityFlagBitsEXT; // for readability + using MESSAGE = vk::DebugUtilsMessageTypeFlagBitsEXT; + return { {}, + SEVERITY::eWarning | SEVERITY::eError, + MESSAGE::eGeneral | MESSAGE::ePerformance | MESSAGE::eValidation, + &debugUtilsMessengerCallback }; +} + +/* + The dynamic loader allows us to access many extensions + Required before creating instance for loading the extension VK_EXT_DEBUG_UTILS_EXTENSION_NAME +*/ +void initDynamicLoader() +{ +#if VK_HEADER_VERSION >= 301 + using VulkanDynamicLoader = vk::detail::DynamicLoader; +#else + using VulkanDynamicLoader = vk::DynamicLoader; +#endif + static VulkanDynamicLoader dl; + static PFN_vkGetInstanceProcAddr vkGetInstanceProcAddr = dl.getProcAddress("vkGetInstanceProcAddr"); + VULKAN_HPP_DEFAULT_DISPATCHER.init(vkGetInstanceProcAddr); +} + + +void createInstance(vk::Instance& instance, vk::DebugUtilsMessengerEXT& debugUtilsMessenger, + std::string appName, std::string engineName) +{ + initDynamicLoader(); + vk::ApplicationInfo applicationInfo(appName.c_str(), 1, engineName.c_str(), 1, VK_API_VERSION_1_2); + + //Initialize the InstanceCreateInfo + vk::InstanceCreateInfo instanceCreateInfo( //flags, pAppInfo, layerCount, layerNames, extcount, extNames + {}, &applicationInfo, + 
static_cast(validationLayers.size()), validationLayers.data(), + static_cast(instanceExtensions.size()), instanceExtensions.data()); + + // DebugInfo: use of StructureChain instead of pNext + // DebugUtils is used to catch errors from the instance + vk::DebugUtilsMessengerCreateInfoEXT debugCreateInfo = makeDebugUtilsMessengerCreateInfoEXT(); + // The StructureChain fills the pNext member of the struct in a typesafe way + // This is only possible with vulkan-hpp, in plain vulkan there is no typechecking + vk::StructureChain chain = + { instanceCreateInfo, debugCreateInfo }; + + if (!enableValidationLayers) //For Release mode + chain.unlink(); + + // Create an Instance + instance = vk::createInstance(chain.get()); + + // Update the dispatcher to use instance related extensions + VULKAN_HPP_DEFAULT_DISPATCHER.init(instance); + + if (enableValidationLayers) + debugUtilsMessenger = instance.createDebugUtilsMessengerEXT(makeDebugUtilsMessengerCreateInfoEXT()); +} + + +std::tuple getComputeAndTransferQueues(vk::PhysicalDevice& pDevice) +{ + uint32_t tq = -1; + std::optional otq; + uint32_t cq = -1; + std::optional ocq; + + using Chain = vk::StructureChain; + using QFB = vk::QueueFlagBits; +#if VK_HEADER_VERSION >= 301 + using VulkanDispatchLoaderDynamic = vk::detail::DispatchLoaderDynamic; +#else + using VulkanDispatchLoaderDynamic = vk::DispatchLoaderDynamic; +#endif + auto queueFamilyProperties2 = pDevice.getQueueFamilyProperties2, VulkanDispatchLoaderDynamic>(); + + for (uint32_t j = 0; j < queueFamilyProperties2.size(); j++) + { + vk::QueueFamilyProperties const& properties = + queueFamilyProperties2[static_cast(j)].get().queueFamilyProperties; + + if (properties.queueFlags & QFB::eCompute) + { + if (!(properties.queueFlags & QFB::eGraphics || + properties.queueFlags & QFB::eProtected)) + ocq = j; // When a queue supports only compute and not graphics we want to use that + cq = j; + } + + if (properties.queueFlags & QFB::eTransfer) + { + if (!(properties.queueFlags & 
QFB::eCompute || + properties.queueFlags & QFB::eGraphics || + properties.queueFlags & QFB::eProtected)) + otq = j; // When a queue supports only transfer, we want to use this one + tq = j; + } + } + + if (otq.has_value()) + tq = otq.value(); + if (ocq.has_value()) + cq = ocq.value(); + return std::tuple(cq, tq); +} +void selectPhysicalDevice(vk::Instance& instance, vk::PhysicalDevice& pDevice) +{ + // Takes the first one + std::vector physDs = instance.enumeratePhysicalDevices(); + + const static char* cache_name = "device_selection_cache"; + const static char* recreation_message = "To select a new device, delete the file \"device_selection_cache\" in your working directory before executing the framework."; + + std::ifstream ifile(cache_name, std::ios::binary); + if (ifile.is_open()) { + DeviceSelectionCache cache; + ifile.read(reinterpret_cast(&cache), sizeof(cache)); + ifile.close(); + for (auto physD : physDs) { + auto props = physD.getProperties2().properties; + if (props.vendorID == cache.vendorID && props.deviceID == cache.deviceID) { + std::cout << "Selecting previously selected device: \"" << props.deviceName << "\"" << std::endl; + std::cout << recreation_message << std::endl; + pDevice = physD; + return; + } + } + std::cout << "Previously selected device was not found." << std::endl; + } + else { + std::cout << "No previous device selection found." 
<< std::endl; + } + + std::cout << "Select one of the available devices:" << std::endl; + + for (int i = 0; i < physDs.size(); i++) { + auto props = physDs[i].getProperties2().properties; + std::cout << i << ")\t" << props.deviceName.data() << std::endl; + } + + uint32_t i; + while (true) { + std::cout << "Enter device number: "; + std::cin >> i; + if (i < physDs.size()) break; + } + + auto props = physDs[i].getProperties2().properties; + DeviceSelectionCache cache; + cache.vendorID = props.vendorID; + cache.deviceID = props.deviceID; + + std::ofstream ofile(cache_name, std::ios::out | std::ios::binary); + ofile.write(reinterpret_cast(&cache), sizeof(cache)); + ofile.close(); + std::cout << "Selected device: \"" << props.deviceName.data() << "\"" << std::endl + << "This device will be automatically selected in the future." << std::endl + << recreation_message << std::endl; + + pDevice = physDs[i]; +} + +// The logical device holds the queues and will be used in almost every call from now on + +void createLogicalDevice(vk::Instance& instance, vk::PhysicalDevice& pDevice, vk::Device& device) +{ + + //First get the queues + uint32_t cQ, tQ; + std::tie(cQ, tQ) = getComputeAndTransferQueues(pDevice); + std::vector queuesInfo; + // flags, queueFamily, queueCount, queuePriority + float prio = 1.f; + vk::DeviceQueueCreateInfo computeInfo({}, cQ, 1U, &prio); + vk::DeviceQueueCreateInfo transferInfo({}, tQ, 1U, &prio); + + queuesInfo.push_back(computeInfo); + //queuesInfo.push_back(transferInfo); + // {}, queueCreateInfoCount, pQueueCreateInfos, enabledLayerCount, ppEnabledLayerNames, enabledExtensionCount, ppEnabledExtensionNames, pEnabledFeatures + + std::vector extensionNames_(extensionNames); + + auto deviceExtensionProperties = pDevice.enumerateDeviceExtensionProperties(); + bool enable_portability_subset = false;; + for (auto ext : deviceExtensionProperties) { + if (strcmp(ext.extensionName.data(), VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME) == 0) { + 
enable_portability_subset = true; + } + } + + if (enable_portability_subset) { + extensionNames_.push_back(VK_KHR_PORTABILITY_SUBSET_EXTENSION_NAME); + } + + vk::DeviceCreateInfo dci({}, CAST(queuesInfo), queuesInfo.data(), + CAST(validationLayers), validationLayers.data(), + CAST(extensionNames_), extensionNames_.data()); // no extension + + device = pDevice.createDevice(dci); + VULKAN_HPP_DEFAULT_DISPATCHER.init(device); + + setObjectName(device, device, "This is my lovely device !"); +} +void createCommandPool(vk::Device& device, vk::CommandPool& commandPool, uint32_t queueIndex) +{ + vk::CommandPoolCreateInfo cpi(vk::CommandPoolCreateFlags(), queueIndex); + commandPool = device.createCommandPool(cpi); +} + +void destroyInstance(vk::Instance& instance, vk::DebugUtilsMessengerEXT& debugUtilsMessenger) +{ +#ifndef NDEBUG + instance.destroyDebugUtilsMessengerEXT(debugUtilsMessenger); +#endif + instance.destroy(); +} +void destroyLogicalDevice(vk::Device& device) +{ + device.destroy(); +} + +void destroyCommandPool(vk::Device& device, vk::CommandPool& commandPool) +{ + device.destroyCommandPool(commandPool); + commandPool = vk::CommandPool(); +} + +void showAvailableQueues(vk::PhysicalDevice& pDevice, bool diagExt) +{ + + using Chain = vk::StructureChain; +#if VK_HEADER_VERSION >= 301 + using VulkanDispatchLoaderDynamic = vk::detail::DispatchLoaderDynamic; +#else + using VulkanDispatchLoaderDynamic = vk::DispatchLoaderDynamic; +#endif + auto queueFamilyProperties2 = pDevice.getQueueFamilyProperties2, VulkanDispatchLoaderDynamic>(); + + for (size_t j = 0; j < queueFamilyProperties2.size(); j++) + { + std::cout << "\t" + << "QueueFamily " << j << "\n"; + vk::QueueFamilyProperties const& properties = + queueFamilyProperties2[j].get().queueFamilyProperties; + std::cout << "\t\t" + << "QueueFamilyProperties:\n"; + std::cout << "\t\t\t" + << "queueFlags = " << vk::to_string(properties.queueFlags) << "\n"; + std::cout << "\t\t\t" + << "queueCount = " << 
properties.queueCount << "\n"; + std::cout << "\t\t\t" + << "timestampValidBits = " << properties.timestampValidBits << "\n"; + std::cout << "\t\t\t" + << "minImageTransferGranularity = " << properties.minImageTransferGranularity.width << " x " + << properties.minImageTransferGranularity.height << " x " + << properties.minImageTransferGranularity.depth << "\n"; + std::cout << "\n"; + + if (diagExt) + { + vk::QueueFamilyCheckpointPropertiesNV const& checkpointProperties = + queueFamilyProperties2[j].get(); + std::cout << "\t\t" + << "CheckPointPropertiesNV:\n"; + std::cout << "\t\t\t" + << "checkpointExecutionStageMask = " + << vk::to_string(checkpointProperties.checkpointExecutionStageMask) << "\n"; + std::cout << "\n"; + } + } +} + +void createTimestampQueryPool(vk::Device& device, vk::QueryPool& queryPool, uint32_t queryCount) +{ + vk::QueryPoolCreateInfo createInfo({}, vk::QueryType::eTimestamp, queryCount); + queryPool = device.createQueryPool(createInfo); +} + +void destroyQueryPool(vk::Device& device, vk::QueryPool& queryPool) +{ + device.destroyQueryPool(queryPool); + queryPool = vk::QueryPool(); +} + +void printDeviceCapabilities(vk::PhysicalDevice& pDevice) +{ + //vk::PhysicalDeviceFeatures features = physicalDevice.getFeatures(); + std::vector ext = pDevice.enumerateDeviceExtensionProperties(); + std::vector layers = pDevice.enumerateDeviceLayerProperties(); + vk::PhysicalDeviceMemoryProperties memoryProperties = pDevice.getMemoryProperties(); + vk::PhysicalDeviceProperties properties = pDevice.getProperties(); + vk::PhysicalDeviceType dt = properties.deviceType; + + std::cout << "====================" << std::endl + << "Device Name: " << properties.deviceName << std::endl + << "Device ID: " << properties.deviceID << std::endl + << "Device Type: " << vk::to_string(properties.deviceType) << std::endl + << "Driver Version: " << properties.driverVersion << std::endl + << "API Version: " << properties.apiVersion << std::endl + << "====================" << 
std::endl + << std::endl; + + bool budgetExt = false; + bool diagExt = false; + std::cout << "This device supports the following extensions (" << ext.size() << "): " << std::endl; + for (vk::ExtensionProperties e : ext) + { + std::cout << std::string(e.extensionName.data()) << std::endl; + if (std::string(e.extensionName.data()) == VK_EXT_MEMORY_BUDGET_EXTENSION_NAME) + budgetExt = true; + if (std::string(e.extensionName.data()) == VK_NV_DEVICE_DIAGNOSTIC_CHECKPOINTS_EXTENSION_NAME) + diagExt = true; + } + + std::cout << "This device supports the following memory types (" << memoryProperties.memoryTypeCount << "): " << std::endl; + uint32_t c = 0U; + for (vk::MemoryType e : memoryProperties.memoryTypes) + { + if (c > memoryProperties.memoryTypeCount) + break; + + std::cout << e.heapIndex << "\t "; + std::cout << vk::to_string(e.propertyFlags) << std::endl; + c++; + } + std::cout << "====================" << std::endl + << std::endl; + + if (budgetExt) + { + std::cout << "This device has the following heaps (" << memoryProperties.memoryHeapCount << "): " << std::endl; + c = 0U; + for (vk::MemoryHeap e : memoryProperties.memoryHeaps) + { + if (c > memoryProperties.memoryHeapCount) + break; + + std::cout << "Size: " << formatSize(e.size) << "\t "; + std::cout << vk::to_string(e.flags) << std::endl; + c++; + } + } + + std::cout << "====================" << std::endl + << std::endl + << "This device has the following layers (" << layers.size() << "): " << std::endl; + for (vk::LayerProperties l : layers) + std::cout << std::string(l.layerName.data()) << "\t : " << std::string(l.description.data()) << std::endl; + std::cout << "====================" << std::endl + << std::endl; + + showAvailableQueues(pDevice, diagExt); +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 0000000..49b4508 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,136 @@ +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 +#include +#include +#include +#include 
"initialization.h" +#include "utils.h" +#include "A2Task1.h" +#include "A2Task2.h" +#include "A2Task1Solution/Sequential.h" +#include "A2Task1Solution/Interleaved.h" +#include "A2Task1Solution/KernelDecomposition.h" +#include "A2Task2Solution/Naive.h" +#include "A2Task2Solution/KernelDecomposition.h" +#include "renderdoc.h" + +void run_A2_task1(AppResources &app){ + size_t size = 128*1024*1024; + A2Task1 a2Task1(size); + std::cout<<"====== A2 TASK 1 ======" <cleanup(); + mstime += solution->mstime / N; + + if (!pass) break; + } + + if (pass) { + std::cout << "TEST PASSED. Execution time: " << mstime<< " ms, " + << "Throughput: " << size / mstime / 1000000 << " GE/s" << std::endl; + } else { + std::cout << "TEST FAILED" << std::endl; + } + + }; + A2Task1SolutionInterleaved interleavedSolution(app, 128); + evaluateTask1Solution(&interleavedSolution, "Interleaved"); + + A2Task1SolutionSequential sequentialSolution(app, 128); + evaluateTask1Solution(&sequentialSolution, "Sequential"); + + A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv"); + evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition"); + + A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv"); + evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll"); + + A2Task1SolutionKernelDecomposition kernelDecompositionAtomicSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionAtomic.comp.spv"); + evaluateTask1Solution(&kernelDecompositionAtomicSolution, "KernelDecomposition Atomic"); +} +void run_A2_task2(AppResources& app){ + + size_t size = 128*1024*1024; + std::cout<<"====== A2 TASK 2 ======" <size() << std::endl; + + bool pass = true; + float mstime = 0.f; + for (int i = 0; i < N; i++) { + pass &= task->evaluateSolution(*solution); + solution->cleanup(); + mstime 
+= solution->mstime / N; + + if (!pass) break; + } + + if (pass) { + std::cout << "Execution time: " << mstime<< " ms, " + << "Throughput: " << task->size() / mstime / 1000000 << " GE/s" << std::endl; + std::cout << "TEST PASSED" << std::endl; + } else { + std::cout << "TEST FAILED" << std::endl; + } + }; + + A2Task2SolutioNaive naiveSolution(app, 128); + evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5); + + A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal); + evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5); + + A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128); + evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5); + +} +int main() +{ + try + { + AppResources app; + + initApp(app); + + renderdoc::initialize(); + renderdoc::startCapture(); + + run_A2_task1(app); + + run_A2_task2(app); + + renderdoc::endCapture(); + + app.destroy(); + } + catch (vk::SystemError &err) + { + std::cout << "vk::SystemError: " << err.what() << std::endl; + exit(-1); + } + catch (std::exception &err) + { + std::cout << "std::exception: " << err.what() << std::endl; + exit(-1); + } + catch (...) 
+ { + std::cout << "unknown error\n"; + exit(-1); + } + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/src/renderdoc.cpp b/src/renderdoc.cpp new file mode 100644 index 0000000..27e311e --- /dev/null +++ b/src/renderdoc.cpp @@ -0,0 +1,50 @@ +#include "renderdoc.h" + +#include + +#ifdef ENABLE_RENDERDOC + #include "renderdoc_app.h" + + #ifdef _WIN32 + #include + #elif __linux__ + #include + #endif + + static RENDERDOC_API_1_1_2 *rdoc_api = nullptr; +#endif + +namespace renderdoc { + void initialize() { + #ifdef ENABLE_RENDERDOC + pRENDERDOC_GetAPI RENDERDOC_GetAPI = nullptr; + + #ifdef _WIN32 + if(HMODULE mod = GetModuleHandleA("renderdoc.dll")) + RENDERDOC_GetAPI = (pRENDERDOC_GetAPI)GetProcAddress(mod, "RENDERDOC_GetAPI"); + #elif __linux__ + if(void *mod = dlopen("librenderdoc.so", RTLD_NOW | RTLD_NOLOAD)) + RENDERDOC_GetAPI = (pRENDERDOC_GetAPI)dlsym(mod, "RENDERDOC_GetAPI"); + #endif + + if (RENDERDOC_GetAPI != nullptr) { + int ret = RENDERDOC_GetAPI(eRENDERDOC_API_Version_1_1_2, (void **)&rdoc_api); + assert(ret == 1); + } + #endif + } + + void startCapture() { + #ifdef ENABLE_RENDERDOC + if (rdoc_api) + rdoc_api->StartFrameCapture(nullptr, nullptr); + #endif + } + + void endCapture() { + #ifdef ENABLE_RENDERDOC + if (rdoc_api) + rdoc_api->EndFrameCapture(nullptr, nullptr); + #endif + } +} diff --git a/src/task_common.cpp b/src/task_common.cpp new file mode 100644 index 0000000..ed1e1cb --- /dev/null +++ b/src/task_common.cpp @@ -0,0 +1,116 @@ +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 +#include +#include +#include +#include "task_common.h" +#include "initialization.h" +#include "utils.h" + +namespace Cmn { + //We have a binding vector ready to become a descriptorSetLayout + void createDescriptorSetLayout(vk::Device& device, + std::vector& bindings, + vk::DescriptorSetLayout& descLayout) { + vk::DescriptorSetLayoutCreateInfo layoutInfo( + {}, + CAST(bindings), // Number of binding infos + bindings.data() // Array of 
binding infos + ); + descLayout = device.createDescriptorSetLayout(layoutInfo); + } + + void addStorage(std::vector& bindings, uint32_t binding) { + //Bindings needed for DescriptorSetLayout + //The DescriptorType eStorageBuffer is used in our case as storage buffer for compute shader + //The ID binding(argument) is needed in the shader + //DescriptorCount is set to 1U + bindings.push_back(vk::DescriptorSetLayoutBinding( + binding, // The binding number of this entry + vk::DescriptorType::eStorageBuffer, // Type of resource descriptors used for this binding + 1U, // Number of descriptors contained in the binding + vk::ShaderStageFlagBits::eCompute) // All defined shader stages can access the resource + ); + } + + void allocateDescriptorSet(vk::Device& device, vk::DescriptorSet& descSet, vk::DescriptorPool& descPool, + vk::DescriptorSetLayout& descLayout) { + // You can technically allocate multiple layouts at once, we don't need that (so we put 1) + vk::DescriptorSetAllocateInfo descAllocInfo(descPool, 1U, &descLayout); + // Therefore the vector is length one, we want to take its (only) element + descSet = device.allocateDescriptorSets(descAllocInfo)[0]; + } + + + //Binding our DescriptorSet to Buffer + //VK_WHOLE_SIZE is specified to bind the entire Buffer + //DescriptorType eStorageBuffer in our case should be coherant with DescriptorSetLayout + //WriteDescriptorSets(creates array) and updateDescriptorSets can be used only once + void bindBuffers(vk::Device& device, vk::Buffer& b, vk::DescriptorSet& set, uint32_t binding) { + // Buffer info and data offset info + vk::DescriptorBufferInfo descInfo( + b, // Buffer to get data from + 0ULL, // Position of start of data + VK_WHOLE_SIZE // Size of data + ); + + // Binding index in the shader V + vk::WriteDescriptorSet write(set, binding, 0U, 1U, + vk::DescriptorType::eStorageBuffer, nullptr, &descInfo); + device.updateDescriptorSets(1U, &write, 0U, nullptr); + } + + void createPipeline(vk::Device& device, vk::Pipeline& 
pipeline, + vk::PipelineLayout& pipLayout, vk::SpecializationInfo& specInfo, + vk::ShaderModule& sModule) { + vk::PipelineShaderStageCreateInfo stageInfo(vk::PipelineShaderStageCreateFlags(), + vk::ShaderStageFlagBits::eCompute, sModule, + "main", &specInfo); + + vk::ComputePipelineCreateInfo computeInfo(vk::PipelineCreateFlags(), stageInfo, pipLayout); + + // This is a workaround: ideally there should not be a ".value" + // This should be fixed in later releases of the SDK + pipeline = device.createComputePipeline(nullptr, computeInfo, nullptr).value; + } + + //Number of DescriptorSets is one by default + void createDescriptorPool(vk::Device& device, + std::vector& bindings, vk::DescriptorPool& descPool, + uint32_t numDescriptorSets) { + vk::DescriptorPoolSize descriptorPoolSize = vk::DescriptorPoolSize( + vk::DescriptorType::eStorageBuffer, bindings.size() * numDescriptorSets); + vk::DescriptorPoolCreateInfo descriptorPoolCI = vk::DescriptorPoolCreateInfo( + vk::DescriptorPoolCreateFlags(), numDescriptorSets, 1U, &descriptorPoolSize); + + descPool = device.createDescriptorPool(descriptorPoolCI); + } + + + void createShader(vk::Device& device, vk::ShaderModule& shaderModule, const std::string& filename) { + std::vector cshader = readFile(filename); + // Shader Module creation information + vk::ShaderModuleCreateInfo smi( + {}, + static_cast(cshader.size()), // Size of code + reinterpret_cast(cshader.data())); // Pointer to code (of uint32_t pointer type) + shaderModule = device.createShaderModule(smi); + } +} + +void TaskResources::destroy(vk::Device& device) { + //Destroy all the resources we created in reverse order + //Pipeline Should be destroyed before PipelineLayout + device.destroyPipeline(this->pipeline); + //PipelineLayout should be destroyed before DescriptorPool + device.destroyPipelineLayout(this->pipelineLayout); + //DescriptorPool should be destroyed before the DescriptorSetLayout + device.destroyDescriptorPool(this->descriptorPool); + 
device.destroyDescriptorSetLayout(this->descriptorSetLayout); + device.destroyShaderModule(this->cShader); + //The DescriptorSet does not need to be destroyed, It is managed by DescriptorPool. + + std::cout << std::endl + << "destroyed everything successfully in task" << std::endl; +} diff --git a/src/utils.cpp b/src/utils.cpp new file mode 100644 index 0000000..17bd8d0 --- /dev/null +++ b/src/utils.cpp @@ -0,0 +1,109 @@ +#include +#include +#include +#include +#include +#define VULKAN_HPP_DISPATCH_LOADER_DYNAMIC 1 +#include + +#include "utils.h" + +std::vector readFile(const std::string& filename) { + std::ifstream file(filename, std::ios::ate | std::ios::binary); + + if (!file.is_open()) { + std::string error = "failed to open file: " + filename; + throw std::runtime_error(error); + } + size_t fileSize = (size_t) file.tellg(); + + std::vector buffer(fileSize); + file.seekg(0); + file.read(buffer.data(), fileSize); + file.close(); + // uncomment for debug + //std::cout << "read " << buffer.size() << " bytes of data in file " << filename << std::endl; + return buffer; +} + +std::string formatSize(uint64_t size) { + std::ostringstream oss; + if (size < 1024) { + oss << size << " B"; + } else if (size < 1024 * 1024) { + oss << size / 1024.f << " KB"; + } else if (size < 1024 * 1024 * 1024) { + oss << size / (1024.0f * 1024.0f) << " MB"; + } else { + oss << size / (1024.0f * 1024.0f * 1024.0f) << " GB"; + } + return oss.str(); +} + +uint32_t findMemoryType(uint32_t typeFilter, vk::MemoryPropertyFlags properties, vk::PhysicalDevice& pdevice) { + vk::PhysicalDeviceMemoryProperties memProperties = pdevice.getMemoryProperties(); + for (uint32_t i = 0; i < memProperties.memoryTypeCount; i++) { + if ((typeFilter & (1 << i)) && (memProperties.memoryTypes[i].propertyFlags & properties) == properties) { + return i; + } + } + + throw std::runtime_error("failed to find suitable memory type!"); +} + +void createBuffer(vk::PhysicalDevice& pDevice, vk::Device& device, + const 
vk::DeviceSize& size, vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties, std::string name, vk::Buffer& buffer, + vk::DeviceMemory& bufferMemory) { + vk::BufferCreateInfo inBufferInfo({}, size, usage); + buffer = device.createBuffer(inBufferInfo); + setObjectName(device, buffer, name); + + vk::MemoryRequirements memReq = device.getBufferMemoryRequirements(buffer); + vk::MemoryAllocateInfo allocInfo(memReq.size, + findMemoryType(memReq.memoryTypeBits, properties, pDevice)); + + bufferMemory = device.allocateMemory(allocInfo); + device.bindBufferMemory(buffer, bufferMemory, 0U); +} + +void createBuffer(vk::PhysicalDevice& pDevice, vk::Device& device, + const vk::DeviceSize& size, vk::BufferUsageFlags usage, + vk::MemoryPropertyFlags properties, std::string name, Buffer& buffer) { + createBuffer(pDevice, device, size, usage, properties, name, buffer.buf, buffer.mem); +} + +void destroyBuffer(vk::Device& device, Buffer& buffer) { + device.destroyBuffer(buffer.buf); + device.freeMemory(buffer.mem); +} + +void copyBuffer(vk::Device& device, vk::Queue& q, vk::CommandPool& commandPool, + const vk::Buffer& srcBuffer, vk::Buffer& dstBuffer, vk::DeviceSize byteSize) { + vk::CommandBuffer commandBuffer = beginSingleTimeCommands(device, commandPool); + + vk::BufferCopy copyRegion(0ULL, 0ULL, byteSize); + commandBuffer.copyBuffer(srcBuffer, dstBuffer, 1, ©Region); + + endSingleTimeCommands(device, q, commandPool, commandBuffer); +} + +vk::CommandBuffer beginSingleTimeCommands(vk::Device& device, vk::CommandPool& commandPool) { + vk::CommandBufferAllocateInfo allocInfo(commandPool, vk::CommandBufferLevel::ePrimary, 1); + + vk::CommandBuffer commandBuffer = device.allocateCommandBuffers(allocInfo)[0]; + + vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit); + commandBuffer.begin(beginInfo); + + return commandBuffer; +} + +void endSingleTimeCommands(vk::Device& device, vk::Queue& q, + vk::CommandPool& commandPool, vk::CommandBuffer& 
commandBuffer) { + commandBuffer.end(); + vk::SubmitInfo submitInfo(0U, nullptr, nullptr, 1U, &commandBuffer); + q.submit({submitInfo}, nullptr); + q.waitIdle(); + device.freeCommandBuffers(commandPool, 1, &commandBuffer); +}