quick commit
This commit is contained in:
@@ -9,14 +9,18 @@ in uint gl_LocalInvocationIndex;
|
|||||||
*/
|
*/
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout(push_constant) uniform PushStruct {
|
layout(push_constant) uniform PushStruct
|
||||||
uint size;
|
{
|
||||||
} p;
|
uint offset;
|
||||||
|
}
|
||||||
|
p;
|
||||||
|
|
||||||
layout(binding = 0) buffer inBuffer {
|
layout(binding = 0) buffer inBuffer
|
||||||
|
{
|
||||||
uint v[];
|
uint v[];
|
||||||
};
|
};
|
||||||
layout(binding = 1) buffer outBuffer {
|
layout(binding = 1) buffer outBuffer
|
||||||
|
{
|
||||||
uint g_v[];
|
uint g_v[];
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -25,22 +29,37 @@ layout(binding = 1) buffer outBuffer {
|
|||||||
const uint bufferSize = 256;
|
const uint bufferSize = 256;
|
||||||
shared uint[bufferSize] localBuffer;
|
shared uint[bufferSize] localBuffer;
|
||||||
|
|
||||||
void main() {
|
void main()
|
||||||
// TODO: Kernel implementation
|
{
|
||||||
|
uint tid = gl_LocalInvocationID.x;
|
||||||
|
uint gid = gl_WorkGroupID.x;
|
||||||
|
uint offset = gid * bufferSize;
|
||||||
|
|
||||||
for (uint i = p.size / 2; i < 0; i -= 2) {
|
uint idx1 = offset + tid;
|
||||||
localBuffer[i] = v[i] + v[i + 1];
|
uint idx2 = offset + tid + gl_WorkGroupSize.x;
|
||||||
}
|
|
||||||
|
|
||||||
for (uint j = bufferSize ; j != 0; j / 2) {
|
uint val1 = 0;
|
||||||
for (uint i = bufferSize / 2; i < 0; i -= 2) {
|
uint val2 = 0;
|
||||||
localBuffer[i] = localBuffer[i] + localBuffer[i + 1];
|
|
||||||
|
if (idx1 < p.offset)
|
||||||
|
val1 = v[idx1];
|
||||||
|
if (idx2 < p.offset)
|
||||||
|
val2 = v[idx2];
|
||||||
|
|
||||||
|
localBuffer[tid] = val1 + val2;
|
||||||
|
|
||||||
|
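// Reduction in shared memory. NOTE(review): barrier() is called inside the `if (tid < s)` branch below, so invocations with tid >= s never reach it — confirm this is intended.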
|
||||||
|
for (uint s = gl_WorkGroupSize.x / 2; s > 0; s /= 2)
|
||||||
|
{
|
||||||
|
if (tid < s)
|
||||||
|
{
|
||||||
|
barrier();
|
||||||
|
localBuffer[tid] += localBuffer[tid + s];
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
localBuffer[0] = localBuffer[0] + localBuffer[1];
|
if (tid == 0)
|
||||||
|
{
|
||||||
for (uint i = 0; i < bufferSize; i ++) {
|
g_v[gid] = localBuffer[tid];
|
||||||
g_v[i] = localBuffer[i];
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -9,16 +9,48 @@ in uint gl_LocalInvocationIndex;
|
|||||||
*/
|
*/
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout(push_constant) uniform PushStruct {
|
layout(push_constant) uniform PushStruct
|
||||||
uint size;
|
{
|
||||||
uint offset;
|
uint offset;
|
||||||
} p;
|
}
|
||||||
|
p;
|
||||||
layout(binding = 0) buffer inBuffer { uint v[]; };
|
|
||||||
layout(binding = 1) buffer outBuffer { uint g_v[]; };
|
layout(binding = 0) buffer inBuffer
|
||||||
|
{
|
||||||
// TODO: Shared variables
|
uint v[];
|
||||||
|
};
|
||||||
void main() {
|
layout(binding = 1) buffer outBuffer
|
||||||
// TODO: Kernel implementation
|
{
|
||||||
|
uint g_v[];
|
||||||
|
};
|
||||||
|
|
||||||
|
const uint bufferSize = 256;
|
||||||
|
shared uint localBuffer;
|
||||||
|
|
||||||
|
void main()
|
||||||
|
{
|
||||||
|
uint tid = gl_LocalInvocationID.x;
|
||||||
|
uint gid = gl_WorkGroupID.x;
|
||||||
|
uint offset = gid * bufferSize;
|
||||||
|
|
||||||
|
uint idx1 = offset + tid;
|
||||||
|
uint idx2 = offset + tid + gl_WorkGroupSize.x;
|
||||||
|
|
||||||
|
uint val1 = 0;
|
||||||
|
uint val2 = 0;
|
||||||
|
|
||||||
|
if (idx1 < p.offset)
|
||||||
|
val1 = v[idx1];
|
||||||
|
if (idx2 < p.offset)
|
||||||
|
val2 = v[idx2];
|
||||||
|
|
||||||
|
if (tid == 0)
|
||||||
|
localBuffer = 0;
|
||||||
|
barrier();
|
||||||
|
|
||||||
|
uint partial = val1 + val2;
|
||||||
|
atomicAdd(localBuffer, partial);
|
||||||
|
barrier();
|
||||||
|
if (tid == 0)
|
||||||
|
g_v[gid] = localBuffer;
|
||||||
}
|
}
|
||||||
@@ -9,16 +9,106 @@ in uint gl_LocalInvocationIndex;
|
|||||||
*/
|
*/
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout(push_constant) uniform PushStruct {
|
layout(push_constant) uniform PushStruct
|
||||||
uint size;
|
{
|
||||||
uint offset;
|
uint offset;
|
||||||
} p;
|
}
|
||||||
|
p;
|
||||||
|
|
||||||
layout(binding = 0) buffer inBuffer { uint v[]; };
|
layout(binding = 0) buffer inBuffer
|
||||||
layout(binding = 1) buffer outBuffer { uint g_v[]; };
|
{
|
||||||
|
uint v[];
|
||||||
|
};
|
||||||
|
layout(binding = 1) buffer outBuffer
|
||||||
|
{
|
||||||
|
uint g_v[];
|
||||||
|
};
|
||||||
|
|
||||||
// TODO: Shared variables
|
// TODO: Shared variables
|
||||||
|
// 512 Elements but initial reduction is done
|
||||||
|
const uint bufferSize = 256;
|
||||||
|
shared uint[bufferSize] localBuffer;
|
||||||
|
|
||||||
void main() {
|
void main()
|
||||||
// TODO: Kernel implementation
|
{
|
||||||
|
uint tid = gl_LocalInvocationID.x;
|
||||||
|
uint gid = gl_WorkGroupID.x;
|
||||||
|
uint offset = gid * bufferSize;
|
||||||
|
|
||||||
|
uint idx1 = offset + tid;
|
||||||
|
uint idx2 = offset + tid + gl_WorkGroupSize.x;
|
||||||
|
|
||||||
|
uint val1 = 0;
|
||||||
|
uint val2 = 0;
|
||||||
|
|
||||||
|
if (idx1 < p.offset)
|
||||||
|
val1 = v[idx1];
|
||||||
|
if (idx2 < p.offset)
|
||||||
|
val2 = v[idx2];
|
||||||
|
|
||||||
|
localBuffer[tid] = val1 + val2;
|
||||||
|
barrier();
|
||||||
|
|
||||||
|
// Reduction in shared memory (the loop handles strides > 32; the last 6 steps, strides 32 down to 1, are unrolled below)
|
||||||
|
for (uint s = gl_WorkGroupSize.x / 2; s > 32; s >>= 1)
|
||||||
|
{
|
||||||
|
if (tid < s)
|
||||||
|
localBuffer[tid] += localBuffer[tid + s];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
|
||||||
|
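// Unrolled tail with full barriers for safety on Vulkan. NOTE(review): each barrier() below sits inside an `if (tid < N)` branch, so only some invocations reach it; GLSL expects barrier() in uniform control flow — confirm.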
|
||||||
|
if (gl_WorkGroupSize.x >= 64)
|
||||||
|
{
|
||||||
|
if (tid < 32)
|
||||||
|
{
|
||||||
|
localBuffer[tid] += localBuffer[tid + 32];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (gl_WorkGroupSize.x >= 32)
|
||||||
|
{
|
||||||
|
if (tid < 16)
|
||||||
|
{
|
||||||
|
localBuffer[tid] += localBuffer[tid + 16];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (gl_WorkGroupSize.x >= 16)
|
||||||
|
{
|
||||||
|
if (tid < 8)
|
||||||
|
{
|
||||||
|
localBuffer[tid] += localBuffer[tid + 8];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (gl_WorkGroupSize.x >= 8)
|
||||||
|
{
|
||||||
|
if (tid < 4)
|
||||||
|
{
|
||||||
|
localBuffer[tid] += localBuffer[tid + 4];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (gl_WorkGroupSize.x >= 4)
|
||||||
|
{
|
||||||
|
if (tid < 2)
|
||||||
|
{
|
||||||
|
localBuffer[tid] += localBuffer[tid + 2];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (gl_WorkGroupSize.x >= 2)
|
||||||
|
{
|
||||||
|
if (tid < 1)
|
||||||
|
{
|
||||||
|
localBuffer[tid] += localBuffer[tid + 1];
|
||||||
|
barrier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tid == 0)
|
||||||
|
{
|
||||||
|
g_v[gid] = localBuffer[tid];
|
||||||
|
}
|
||||||
}
|
}
|
||||||
@@ -11,25 +11,27 @@ in uint gl_LocalInvocationIndex;
|
|||||||
// Why did we not have conflicts in the Reduction?
|
// Why did we not have conflicts in the Reduction?
|
||||||
// Because of the sequential addressing (here we use interleaved => we have conflicts).
|
// Because of the sequential addressing (here we use interleaved => we have conflicts).
|
||||||
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
|
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
|
||||||
#define NUM_BANKS 32
|
#define NUM_BANKS 32
|
||||||
#define NUM_BANKS_LOG 5
|
#define NUM_BANKS_LOG 5
|
||||||
#define SIMD_GROUP_SIZE 32
|
#define SIMD_GROUP_SIZE 32
|
||||||
|
#define BUFFER_SIZE 256
|
||||||
|
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
|
|
||||||
layout(push_constant) uniform PushStruct {
|
layout(push_constant) uniform PushStruct
|
||||||
|
{
|
||||||
uint size;
|
uint size;
|
||||||
} p;
|
}
|
||||||
|
p;
|
||||||
|
|
||||||
layout(binding = 0) buffer inoutBufer {uint array[];};
|
layout(binding = 0) buffer inoutBufer { uint array[]; };
|
||||||
layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
|
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };
|
||||||
|
|
||||||
// TODO: Shared variables
|
shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];
|
||||||
|
|
||||||
// Bank conflicts
|
// Bank conflicts
|
||||||
#define AVOID_BANK_CONFLICTS
|
|
||||||
#ifdef AVOID_BANK_CONFLICTS
|
#ifdef AVOID_BANK_CONFLICTS
|
||||||
// TODO: define your conflict-free macro here
|
#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
|
||||||
#else
|
#else
|
||||||
#define OFFSET(A) (A)
|
#define OFFSET(A) (A)
|
||||||
#endif
|
#endif
|
||||||
@@ -37,16 +39,54 @@ layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
|
|||||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||||
void main()
|
void main()
|
||||||
{
|
{
|
||||||
// TODO: Kernel implementation
|
const uint tid = gl_LocalInvocationID.x;
|
||||||
|
const uint gid = gl_GlobalInvocationID.x;
|
||||||
|
const uint size = BUFFER_SIZE;
|
||||||
|
|
||||||
// Cache first half of elements in the local memory
|
uint val0 = 0;
|
||||||
// Cache second half of elements
|
uint val1 = 0;
|
||||||
|
|
||||||
// Perform up-sweep
|
if (2 * gid < p.size)
|
||||||
|
val0 = array[2 * gid];
|
||||||
|
if (2 * gid + 1 < p.size)
|
||||||
|
val1 = array[2 * gid + 1];
|
||||||
|
|
||||||
// Unroll the last steps when arrived at warp size
|
temp[OFFSET(2 * tid)] = val0;
|
||||||
// Set the last element to 0
|
temp[OFFSET(2 * tid + 1)] = val1;
|
||||||
|
|
||||||
|
// Up-Sweep (Reduction) phase
|
||||||
|
for (uint stride = 1; stride < size; stride <<= 1)
|
||||||
|
{
|
||||||
|
barrier();
|
||||||
|
uint idx = (tid + 1) * stride * 2 - 1;
|
||||||
|
if (idx < size)
|
||||||
|
{
|
||||||
|
temp[OFFSET(idx)] += temp[OFFSET(idx - stride)];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Perform down-sweep
|
// Write the block total to g_v, then clear the last element
|
||||||
|
if (tid == 0)
|
||||||
|
{
|
||||||
|
g_v[gl_WorkGroupID.x] = temp[OFFSET(size - 1)];
|
||||||
|
temp[OFFSET(size - 1)] = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Down-Sweep phase
|
||||||
|
for (uint stride = size >> 1; stride > 0; stride >>= 1)
|
||||||
|
{
|
||||||
|
barrier();
|
||||||
|
uint idx = (tid + 1) * stride * 2 - 1;
|
||||||
|
if (idx < size)
|
||||||
|
{
|
||||||
|
uint t = temp[OFFSET(idx - stride)];
|
||||||
|
temp[OFFSET(idx - stride)] = temp[OFFSET(idx)];
|
||||||
|
temp[OFFSET(idx)] += t;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (2 * gid < p.size)
|
||||||
|
array[2 * gid] = temp[OFFSET(2 * tid)] + val0;
|
||||||
|
if (2 * gid + 1 < p.size)
|
||||||
|
array[2 * gid + 1] = temp[OFFSET(2 * tid + 1)] + val1;
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,18 +8,27 @@ in uvec3 gl_GlobalInvocationID;
|
|||||||
in uint gl_LocalInvocationIndex;
|
in uint gl_LocalInvocationIndex;
|
||||||
*/
|
*/
|
||||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||||
layout (constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
|
layout(constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
|
||||||
|
|
||||||
// Push constant
|
// Push constant
|
||||||
layout(push_constant) uniform PushStruct {
|
layout(push_constant) uniform PushStruct
|
||||||
|
{
|
||||||
uint size;
|
uint size;
|
||||||
} p;
|
}
|
||||||
|
p;
|
||||||
layout(binding = 0) buffer inoutBufer { uint v[]; };
|
|
||||||
layout(binding = 1) buffer offsetBufer { uint g_v[]; };
|
layout(binding = 0) buffer inoutBufer { uint data[]; };
|
||||||
|
layout(binding = 1) buffer offsetBufer { uint offsets[]; };
|
||||||
// TODO: Shared variables
|
|
||||||
|
void main()
|
||||||
void main() {
|
{
|
||||||
// TODO: Shared variables
|
uint tid = gl_LocalInvocationID.x;
|
||||||
|
uint group_id = gl_WorkGroupID.x;
|
||||||
|
|
||||||
|
uint gid0 = group_id * 256 + 2 * tid;
|
||||||
|
uint gid1 = group_id * 256 + 2 * tid + 1;
|
||||||
|
|
||||||
|
uint offset = offsets[group_id - 1];
|
||||||
|
data[gid0] += offset;
|
||||||
|
data[gid1] += offset;
|
||||||
}
|
}
|
||||||
@@ -19,4 +19,14 @@ layout(binding = 0) buffer inBuffer { uint v[]; };
|
|||||||
layout(binding = 1) buffer outBufer { uint g_v[]; };
|
layout(binding = 1) buffer outBufer { uint g_v[]; };
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
uint gid = gl_GlobalInvocationID.x;
|
||||||
|
|
||||||
|
if (gid >= p.size) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (gid < p.offset)
|
||||||
|
g_v[gid] = v[gid];
|
||||||
|
else
|
||||||
|
g_v[gid] = v[gid - p.offset] + v[gid];
|
||||||
}
|
}
|
||||||
@@ -50,36 +50,32 @@ void A2Task1SolutionKernelDecomposition::compute()
|
|||||||
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
|
|
||||||
cb.begin(beginInfo);
|
cb.begin(beginInfo);
|
||||||
|
cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
|
||||||
|
|
||||||
// TODO: Implement reduction with kernel decomposition
|
uint blocksize = workGroupSize * 2;
|
||||||
// NOTE: make sure that activeBuffer points to the buffer with the final result in the end
|
uint windowSize = mpInput->size();
|
||||||
// That buffer is read back for the correctness check
|
|
||||||
// (A2Task1SolutionKernelDecomposition::result())
|
|
||||||
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
|
|
||||||
|
|
||||||
uint blocksize = 512;
|
|
||||||
uint kernelCount = mpInput->size() / blocksize;
|
|
||||||
PushConstant p;
|
PushConstant p;
|
||||||
|
|
||||||
for (; kernelCount > 0; kernelCount /= 2)
|
activeBuffer = 1;
|
||||||
|
|
||||||
|
while (windowSize > 1)
|
||||||
{
|
{
|
||||||
activeBuffer = activeBuffer == 0 ? 1 : 0;
|
activeBuffer = 1 - activeBuffer;
|
||||||
|
uint numGroups = (windowSize + blocksize - 1) / blocksize;
|
||||||
|
|
||||||
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
|
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
|
||||||
p.size = blocksize;
|
p.offset = windowSize;
|
||||||
|
|
||||||
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p);
|
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushConstant), &p);
|
||||||
|
|
||||||
// for loop to call each kernel
|
cb.dispatch(numGroups, 1, 1);
|
||||||
for (int i = 0; i < kernelCount; i++)
|
|
||||||
{
|
|
||||||
cb.dispatch(i * blocksize, 0, 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
|
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
|
||||||
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
|
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
|
||||||
|
|
||||||
std::cout << "deine mum \n" ;
|
windowSize = numGroups;
|
||||||
}
|
}
|
||||||
|
// TODO: check which buffer is active and read from that one (result() currently reads buffers[1 - activeBuffer])
|
||||||
cb.end();
|
cb.end();
|
||||||
|
|
||||||
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
||||||
@@ -97,7 +93,7 @@ void A2Task1SolutionKernelDecomposition::compute()
|
|||||||
uint A2Task1SolutionKernelDecomposition::result() const
|
uint A2Task1SolutionKernelDecomposition::result() const
|
||||||
{
|
{
|
||||||
std::vector<uint> result(1, 0);
|
std::vector<uint> result(1, 0);
|
||||||
fillHostWithStagingBuffer<uint>(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
|
fillHostWithStagingBuffer<uint>(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[1 - activeBuffer], result);
|
||||||
return result[0];
|
return result[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ public:
|
|||||||
private:
|
private:
|
||||||
struct PushConstant
|
struct PushConstant
|
||||||
{
|
{
|
||||||
uint size;
|
uint offset;
|
||||||
};
|
};
|
||||||
|
|
||||||
AppResources &app;
|
AppResources &app;
|
||||||
|
|||||||
@@ -1,12 +1,16 @@
|
|||||||
#include "KernelDecomposition.h"
|
#include "KernelDecomposition.h"
|
||||||
|
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
#include "host_timer.h"
|
#include "host_timer.h"
|
||||||
|
|
||||||
A2Task2SolutionKernelDecomposition::A2Task2SolutionKernelDecomposition(AppResources& app, uint workGroupSize): app(app),
|
A2Task2SolutionKernelDecomposition::A2Task2SolutionKernelDecomposition(AppResources &app, uint workGroupSize) : app(app),
|
||||||
workGroupSize(workGroupSize) {
|
workGroupSize(workGroupSize)
|
||||||
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
void A2Task2SolutionKernelDecomposition::prepare(const std::vector<uint>& input) {
|
void A2Task2SolutionKernelDecomposition::prepare(const std::vector<uint> &input)
|
||||||
|
{
|
||||||
workSize = input.size();
|
workSize = input.size();
|
||||||
|
|
||||||
// Descriptor & Pipeline Layout
|
// Descriptor & Pipeline Layout
|
||||||
@@ -19,41 +23,70 @@ void A2Task2SolutionKernelDecomposition::prepare(const std::vector<uint>& input)
|
|||||||
|
|
||||||
// Specialization constant for workgroup size
|
// Specialization constant for workgroup size
|
||||||
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
|
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
|
||||||
{{0U, 0U, sizeof(workGroupSize)}},
|
{{0U, 0U, sizeof(workGroupSize)}},
|
||||||
};
|
};
|
||||||
std::array<uint32_t, 1> specValues = {workGroupSize}; //for workgroup sizes
|
std::array<uint32_t, 1> specValues = {workGroupSize}; // for workgroup sizes
|
||||||
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
|
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());
|
||||||
CAST(specValues) * sizeof(int), specValues.data());
|
|
||||||
|
|
||||||
// Local PPS Pipeline
|
// Local PPS Pipeline
|
||||||
Cmn::createShader(app.device, cShaderLocalPPS, workingDir + "build/shaders/A2Task2KernelDecomposition.comp.spv");
|
Cmn::createShader(app.device, cShaderLocalPPS, workingDir + "build/shaders/A2Task2KernelDecomposition.comp.spv");
|
||||||
Cmn::createPipeline(app.device, pipelineLocalPPS, pipelineLayout, specInfo, cShaderLocalPPS);
|
Cmn::createPipeline(app.device, pipelineLocalPPS, pipelineLayout, specInfo, cShaderLocalPPS);
|
||||||
|
|
||||||
// Local PPS Offset Pipeline
|
// Local PPS Offset Pipeline
|
||||||
Cmn::createShader(app.device, cShaderLocalPPSOffset,
|
Cmn::createShader(app.device, cShaderLocalPPSOffset, workingDir + "build/shaders/A2Task2KernelDecompositionOffset.comp.spv");
|
||||||
workingDir + "build/shaders/A2Task2KernelDecompositionOffset.comp.spv");
|
|
||||||
Cmn::createPipeline(app.device, pipelineLocalPPSOffset, pipelineLayout, specInfo, cShaderLocalPPSOffset);
|
Cmn::createPipeline(app.device, pipelineLocalPPSOffset, pipelineLayout, specInfo, cShaderLocalPPSOffset);
|
||||||
|
|
||||||
// ### create buffers, get their index in the task.buffers[] array ###
|
// ### create buffers, get their index in the task.buffers[] array ###
|
||||||
using BFlag = vk::BufferUsageFlagBits;
|
using BFlag = vk::BufferUsageFlagBits;
|
||||||
auto makeDLocalBuffer = [ this ](vk::BufferUsageFlags usage, vk::DeviceSize size, std::string name) -> Buffer {
|
auto makeDLocalBuffer = [this](vk::BufferUsageFlags usage, vk::DeviceSize size, std::string name) -> Buffer
|
||||||
|
{
|
||||||
Buffer b;
|
Buffer b;
|
||||||
createBuffer(app.pDevice, app.device, size, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, name, b.buf,
|
createBuffer(app.pDevice, app.device, size, usage, vk::MemoryPropertyFlagBits::eDeviceLocal, name, b.buf, b.mem);
|
||||||
b.mem);
|
|
||||||
return b;
|
return b;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
size_t current_size = input.size();
|
||||||
|
levelSizes.clear();
|
||||||
|
levelSizes.push_back(current_size);
|
||||||
|
while (current_size > 256) // BUFFER_SIZE
|
||||||
|
{
|
||||||
|
current_size = (current_size + 255) / 256;
|
||||||
|
levelSizes.push_back(current_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create buffers
|
||||||
|
// We need one buffer for each level, plus one for the final sum
|
||||||
|
for (size_t size : levelSizes)
|
||||||
|
{
|
||||||
|
inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
|
||||||
|
size * sizeof(uint32_t),
|
||||||
|
"buffer_level_" + std::to_string(inoutBuffers.size())));
|
||||||
|
}
|
||||||
|
// Extra buffer for the last level's output (total sum)
|
||||||
inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
|
inoutBuffers.push_back(makeDLocalBuffer(BFlag::eTransferDst | BFlag::eTransferSrc | BFlag::eStorageBuffer,
|
||||||
input.size() * sizeof(uint32_t), "buffer_inout_0"));
|
sizeof(uint32_t),
|
||||||
|
"buffer_final_sum"));
|
||||||
|
|
||||||
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0],
|
fillDeviceWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0], input);
|
||||||
input);
|
|
||||||
|
|
||||||
// TO DO create additional buffers (by pushing into inoutBuffers) and descriptors (by pushing into descriptorSets)
|
uint32_t numSets = static_cast<uint32_t>(levelSizes.size());
|
||||||
// You need to create an appropriately-sized DescriptorPool first
|
Cmn::createDescriptorPool(app.device, bindings, descriptorPool, numSets);
|
||||||
|
|
||||||
|
// Allocate Descriptor Sets
|
||||||
|
std::vector<vk::DescriptorSetLayout> layouts(numSets, descriptorSetLayout);
|
||||||
|
vk::DescriptorSetAllocateInfo allocInfo(descriptorPool, numSets, layouts.data());
|
||||||
|
descriptorSets = app.device.allocateDescriptorSets(allocInfo);
|
||||||
|
|
||||||
|
// Update Descriptor Sets
|
||||||
|
for (size_t i = 0; i < numSets; ++i)
|
||||||
|
{
|
||||||
|
Cmn::bindBuffers(app.device, inoutBuffers[i].buf, descriptorSets[i], 0);
|
||||||
|
Cmn::bindBuffers(app.device, inoutBuffers[i+1].buf, descriptorSets[i], 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void A2Task2SolutionKernelDecomposition::compute() {
|
void A2Task2SolutionKernelDecomposition::compute()
|
||||||
|
{
|
||||||
vk::CommandBufferAllocateInfo allocInfo(
|
vk::CommandBufferAllocateInfo allocInfo(
|
||||||
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
|
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
|
||||||
vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];
|
vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];
|
||||||
@@ -62,8 +95,46 @@ void A2Task2SolutionKernelDecomposition::compute() {
|
|||||||
|
|
||||||
cb.begin(beginInfo);
|
cb.begin(beginInfo);
|
||||||
|
|
||||||
// TODO: Implement efficient version of scan
|
cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipelineLocalPPS);
|
||||||
// Make sure that the local prefix sum works before you start experimenting with large arrays
|
|
||||||
|
for (size_t i = 0; i < levelSizes.size(); ++i)
|
||||||
|
{
|
||||||
|
PushStruct p;
|
||||||
|
p.size = static_cast<uint32_t>(levelSizes[i]);
|
||||||
|
|
||||||
|
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[i], 0U, nullptr);
|
||||||
|
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct), &p);
|
||||||
|
|
||||||
|
uint32_t groupCount = (levelSizes[i] + 255) / 256;
|
||||||
|
cb.dispatch(groupCount, 1, 1);
|
||||||
|
|
||||||
|
// Barrier between levels
|
||||||
|
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
|
||||||
|
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader,
|
||||||
|
vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipelineLocalPPSOffset);
|
||||||
|
|
||||||
|
if (levelSizes.size() > 1)
|
||||||
|
{
|
||||||
|
for (int i = levelSizes.size() - 2; i >= 0; i--)
|
||||||
|
{
|
||||||
|
PushStruct p;
|
||||||
|
p.size = static_cast<uint32_t>(levelSizes[i]);
|
||||||
|
|
||||||
|
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[i], 0U, nullptr);
|
||||||
|
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct), &p);
|
||||||
|
|
||||||
|
uint32_t groupCount = (levelSizes[i] + 255) / 256;
|
||||||
|
|
||||||
|
cb.dispatch(groupCount, 1, 1);
|
||||||
|
|
||||||
|
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
|
||||||
|
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader,
|
||||||
|
vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
cb.end();
|
cb.end();
|
||||||
|
|
||||||
@@ -79,15 +150,15 @@ void A2Task2SolutionKernelDecomposition::compute() {
|
|||||||
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
|
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint> A2Task2SolutionKernelDecomposition::result() const {
|
std::vector<uint> A2Task2SolutionKernelDecomposition::result() const
|
||||||
|
{
|
||||||
std::vector<uint> result(workSize, 0);
|
std::vector<uint> result(workSize, 0);
|
||||||
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0],
|
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, inoutBuffers[0], result);
|
||||||
result);
|
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void A2Task2SolutionKernelDecomposition::cleanup()
|
||||||
void A2Task2SolutionKernelDecomposition::cleanup() {
|
{
|
||||||
|
|
||||||
app.device.destroyDescriptorPool(descriptorPool);
|
app.device.destroyDescriptorPool(descriptorPool);
|
||||||
|
|
||||||
@@ -101,12 +172,14 @@ void A2Task2SolutionKernelDecomposition::cleanup() {
|
|||||||
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
|
app.device.destroyDescriptorSetLayout(descriptorSetLayout);
|
||||||
bindings.clear();
|
bindings.clear();
|
||||||
|
|
||||||
auto Bclean = [&](Buffer& b) {
|
auto Bclean = [&](Buffer &b)
|
||||||
|
{
|
||||||
app.device.destroyBuffer(b.buf);
|
app.device.destroyBuffer(b.buf);
|
||||||
app.device.freeMemory(b.mem);
|
app.device.freeMemory(b.mem);
|
||||||
};
|
};
|
||||||
|
|
||||||
for (auto inoutBuffer: inoutBuffers) {
|
for (auto inoutBuffer : inoutBuffers)
|
||||||
|
{
|
||||||
Bclean(inoutBuffer);
|
Bclean(inoutBuffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ private:
|
|||||||
// Descriptor Pool
|
// Descriptor Pool
|
||||||
vk::DescriptorPool descriptorPool;
|
vk::DescriptorPool descriptorPool;
|
||||||
|
|
||||||
// TODO extend with any additional members you may need
|
std::vector<vk::DescriptorSet> descriptorSets;
|
||||||
|
std::vector<size_t> levelSizes;
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -1,13 +1,14 @@
|
|||||||
#include "Naive.h"
|
#include "Naive.h"
|
||||||
|
|
||||||
#include "host_timer.h"
|
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
|
|
||||||
A2Task2SolutioNaive::A2Task2SolutioNaive(
|
#include "host_timer.h"
|
||||||
AppResources &app, uint workGroupSize):
|
|
||||||
app(app), workGroupSize(workGroupSize) {}
|
|
||||||
|
|
||||||
void A2Task2SolutioNaive::prepare(const std::vector<uint> &input) {
|
A2Task2SolutioNaive::A2Task2SolutioNaive(
|
||||||
|
AppResources &app, uint workGroupSize) : app(app), workGroupSize(workGroupSize) {}
|
||||||
|
|
||||||
|
void A2Task2SolutioNaive::prepare(const std::vector<uint> &input)
|
||||||
|
{
|
||||||
workSize = input.size();
|
workSize = input.size();
|
||||||
|
|
||||||
// Descriptor & Pipeline Layout
|
// Descriptor & Pipeline Layout
|
||||||
@@ -22,12 +23,11 @@ void A2Task2SolutioNaive::prepare(const std::vector<uint> &input) {
|
|||||||
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
|
std::array<vk::SpecializationMapEntry, 1> specEntries = std::array<vk::SpecializationMapEntry, 1>{
|
||||||
{{0U, 0U, sizeof(workGroupSize)}},
|
{{0U, 0U, sizeof(workGroupSize)}},
|
||||||
};
|
};
|
||||||
std::array<uint32_t, 2> specValues = {workGroupSize}; //for workgroup sizes
|
std::array<uint32_t, 2> specValues = {workGroupSize}; // for workgroup sizes
|
||||||
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(),
|
vk::SpecializationInfo specInfo = vk::SpecializationInfo(CAST(specEntries), specEntries.data(), CAST(specValues) * sizeof(int), specValues.data());
|
||||||
CAST(specValues) * sizeof(int), specValues.data());
|
|
||||||
|
|
||||||
// Local PPS Offset Pipeline
|
// Local PPS Offset Pipeline
|
||||||
Cmn::createShader(app.device, cShader, workingDir +"build/shaders/A2Task2Naive.comp.spv");
|
Cmn::createShader(app.device, cShader, workingDir + "build/shaders/A2Task2Naive.comp.spv");
|
||||||
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, cShader);
|
Cmn::createPipeline(app.device, pipeline, pipelineLayout, specInfo, cShader);
|
||||||
|
|
||||||
// ### create buffers, get their index in the task.buffers[] array ###
|
// ### create buffers, get their index in the task.buffers[] array ###
|
||||||
@@ -49,22 +49,35 @@ void A2Task2SolutioNaive::prepare(const std::vector<uint> &input) {
|
|||||||
activeBuffer = 0;
|
activeBuffer = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void A2Task2SolutioNaive::compute() {
|
void A2Task2SolutioNaive::compute()
|
||||||
|
{
|
||||||
vk::CommandBufferAllocateInfo allocInfo(
|
vk::CommandBufferAllocateInfo allocInfo(
|
||||||
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
|
app.computeCommandPool, vk::CommandBufferLevel::ePrimary, 1U);
|
||||||
vk::CommandBuffer cb = app.device.allocateCommandBuffers( allocInfo )[0];
|
vk::CommandBuffer cb = app.device.allocateCommandBuffers(allocInfo)[0];
|
||||||
|
|
||||||
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
vk::CommandBufferBeginInfo beginInfo(vk::CommandBufferUsageFlagBits::eOneTimeSubmit);
|
||||||
|
|
||||||
cb.begin(beginInfo);
|
cb.begin(beginInfo);
|
||||||
|
|
||||||
cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
|
cb.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline);
|
||||||
|
|
||||||
// TODO: Implement naive scan
|
PushStruct p;
|
||||||
// NOTE: make sure that activeBuffer points to the buffer with the final result in the end
|
p.size = workSize;
|
||||||
// That buffer is read back for the correctness check
|
|
||||||
// (A2Task2SolutionNaive::result())
|
for (uint i = 1; i < workSize; i <<= 1)
|
||||||
// HINT: You can alternate between the two provided descriptor sets to implement ping-pong
|
{
|
||||||
|
cb.bindDescriptorSets(vk::PipelineBindPoint::eCompute, pipelineLayout, 0U, 1U, &descriptorSets[activeBuffer], 0U, nullptr);
|
||||||
|
p.offset = i;
|
||||||
|
|
||||||
|
cb.pushConstants(pipelineLayout, vk::ShaderStageFlagBits::eCompute, 0, sizeof(PushStruct), &p);
|
||||||
|
|
||||||
|
cb.dispatch((workSize + workGroupSize - 1) / workGroupSize, 1, 1);
|
||||||
|
|
||||||
|
vk::MemoryBarrier memoryBarrier(vk::AccessFlagBits::eShaderWrite, vk::AccessFlagBits::eShaderRead);
|
||||||
|
cb.pipelineBarrier(vk::PipelineStageFlagBits::eComputeShader, vk::PipelineStageFlagBits::eComputeShader, vk::DependencyFlags(), 1, &memoryBarrier, 0, nullptr, 0, nullptr);
|
||||||
|
|
||||||
|
activeBuffer = 1 - activeBuffer;
|
||||||
|
}
|
||||||
|
|
||||||
cb.end();
|
cb.end();
|
||||||
|
|
||||||
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
vk::SubmitInfo submitInfo = vk::SubmitInfo(0, nullptr, nullptr, 1, &cb);
|
||||||
@@ -79,13 +92,15 @@ void A2Task2SolutioNaive::compute() {
|
|||||||
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
|
app.device.freeCommandBuffers(app.computeCommandPool, 1U, &cb);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<uint> A2Task2SolutioNaive::result() const {
|
std::vector<uint> A2Task2SolutioNaive::result() const
|
||||||
|
{
|
||||||
std::vector<uint> result(workSize, 0);
|
std::vector<uint> result(workSize, 0);
|
||||||
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
|
fillHostWithStagingBuffer(app.pDevice, app.device, app.transferCommandPool, app.transferQueue, buffers[activeBuffer], result);
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void A2Task2SolutioNaive::cleanup() {
|
void A2Task2SolutioNaive::cleanup()
|
||||||
|
{
|
||||||
app.device.destroyDescriptorPool(descriptorPool);
|
app.device.destroyDescriptorPool(descriptorPool);
|
||||||
|
|
||||||
app.device.destroyPipeline(pipeline);
|
app.device.destroyPipeline(pipeline);
|
||||||
|
|||||||
30
src/main.cpp
30
src/main.cpp
@@ -39,20 +39,20 @@ void run_A2_task1(AppResources &app){
|
|||||||
}
|
}
|
||||||
|
|
||||||
};
|
};
|
||||||
// A2Task1SolutionInterleaved interleavedSolution(app, 128);
|
A2Task1SolutionInterleaved interleavedSolution(app, 128);
|
||||||
// evaluateTask1Solution(&interleavedSolution, "Interleaved");
|
evaluateTask1Solution(&interleavedSolution, "Interleaved");
|
||||||
|
|
||||||
// A2Task1SolutionSequential sequentialSolution(app, 128);
|
A2Task1SolutionSequential sequentialSolution(app, 128);
|
||||||
// evaluateTask1Solution(&sequentialSolution, "Sequential");
|
evaluateTask1Solution(&sequentialSolution, "Sequential");
|
||||||
|
|
||||||
A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
|
A2Task1SolutionKernelDecomposition kernelDecompositionSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecomposition.comp.spv");
|
||||||
evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
|
evaluateTask1Solution(&kernelDecompositionSolution, "KernelDecomposition");
|
||||||
|
|
||||||
// A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
|
A2Task1SolutionKernelDecomposition kernelDecompositionUnrollSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionUnroll.comp.spv");
|
||||||
// evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
|
evaluateTask1Solution(&kernelDecompositionUnrollSolution, "KernelDecomposition Unroll");
|
||||||
|
|
||||||
// A2Task1SolutionKernelDecomposition kernelDecompositionAtomicSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionAtomic.comp.spv");
|
A2Task1SolutionKernelDecomposition kernelDecompositionAtomicSolution(app, 128, workingDir +"build/shaders/A2Task1KernelDecompositionAtomic.comp.spv");
|
||||||
// evaluateTask1Solution(&kernelDecompositionAtomicSolution, "KernelDecomposition Atomic");
|
evaluateTask1Solution(&kernelDecompositionAtomicSolution, "KernelDecomposition Atomic");
|
||||||
}
|
}
|
||||||
void run_A2_task2(AppResources& app){
|
void run_A2_task2(AppResources& app){
|
||||||
|
|
||||||
@@ -61,7 +61,7 @@ void run_A2_task2(AppResources& app){
|
|||||||
|
|
||||||
// This is used for testing local kernel decomposition without extension to arbitrary arrays.
|
// This is used for testing local kernel decomposition without extension to arbitrary arrays.
|
||||||
// Must be power of two and <= 1024!
|
// Must be power of two and <= 1024!
|
||||||
size_t sizeLocal = 128;
|
size_t sizeLocal = 1024;
|
||||||
|
|
||||||
A2Task2 a2Task2(size);
|
A2Task2 a2Task2(size);
|
||||||
A2Task2 a2Task2Local(sizeLocal);
|
A2Task2 a2Task2Local(sizeLocal);
|
||||||
@@ -91,11 +91,11 @@ void run_A2_task2(AppResources& app){
|
|||||||
A2Task2SolutioNaive naiveSolution(app, 128);
|
A2Task2SolutioNaive naiveSolution(app, 128);
|
||||||
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
|
evaluateTask2Solution(&a2Task2, &naiveSolution, "Naive",5);
|
||||||
|
|
||||||
// A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, sizeLocal);
|
A2Task2SolutionKernelDecomposition kernelDecompositionSolutionLocal(app, 128);
|
||||||
// evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
|
evaluateTask2Solution(&a2Task2Local, &kernelDecompositionSolutionLocal, "Kernel Decomposition that fits in one workgroup (normal if 'slow')",5);
|
||||||
|
|
||||||
// A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
|
A2Task2SolutionKernelDecomposition kernelDecompositionSolution(app, 128);
|
||||||
// evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
|
evaluateTask2Solution(&a2Task2, &kernelDecompositionSolution, "Kernel Decomposition",5);
|
||||||
|
|
||||||
}
|
}
|
||||||
int main()
|
int main()
|
||||||
@@ -109,9 +109,9 @@ int main()
|
|||||||
renderdoc::initialize();
|
renderdoc::initialize();
|
||||||
renderdoc::startCapture();
|
renderdoc::startCapture();
|
||||||
|
|
||||||
run_A2_task1(app);
|
// run_A2_task1(app);
|
||||||
|
|
||||||
// run_A2_task2(app);
|
run_A2_task2(app);
|
||||||
|
|
||||||
renderdoc::endCapture();
|
renderdoc::endCapture();
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user