quick commit
This commit is contained in:
@@ -9,14 +9,18 @@ in uint gl_LocalInvocationIndex;
|
||||
*/
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(push_constant) uniform PushStruct {
|
||||
uint size;
|
||||
} p;
|
||||
layout(push_constant) uniform PushStruct
|
||||
{
|
||||
uint offset;
|
||||
}
|
||||
p;
|
||||
|
||||
layout(binding = 0) buffer inBuffer {
|
||||
layout(binding = 0) buffer inBuffer
|
||||
{
|
||||
uint v[];
|
||||
};
|
||||
layout(binding = 1) buffer outBuffer {
|
||||
layout(binding = 1) buffer outBuffer
|
||||
{
|
||||
uint g_v[];
|
||||
};
|
||||
|
||||
@@ -25,22 +29,37 @@ layout(binding = 1) buffer outBuffer {
|
||||
const uint bufferSize = 256;
|
||||
shared uint[bufferSize] localBuffer;
|
||||
|
||||
// Work-group sum reduction using shared memory.
//
// Every invocation loads up to two elements of v[], adds them, and stores the
// partial sum in localBuffer[tid]. The partial sums are then folded in halves
// until localBuffer[0] holds the sum of the whole tile, which invocation 0
// writes to g_v[gl_WorkGroupID.x].
//
// NOTE(review): p.offset is used as the exclusive upper bound on valid input
// indices; elements at or beyond it contribute 0. Confirm against the host
// code that this is the element count.
void main()
{
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_WorkGroupID.x;

    // A work group reads indices [offset, offset + 2 * gl_WorkGroupSize.x),
    // so consecutive tiles must start that many elements apart. Using a fixed
    // 256 would overlap or skip input unless the group size is exactly 128.
    uint offset = gid * 2u * gl_WorkGroupSize.x;

    uint idx1 = offset + tid;
    uint idx2 = idx1 + gl_WorkGroupSize.x;

    uint val1 = 0u;
    uint val2 = 0u;

    if (idx1 < p.offset)
        val1 = v[idx1];
    if (idx2 < p.offset)
        val2 = v[idx2];

    // First reduction step happens while loading.
    localBuffer[tid] = val1 + val2;

    // Reduction in shared memory.
    // barrier() has to be reached by every invocation of the work group, so
    // it is placed outside the `tid < s` branch. It makes the writes of the
    // previous step (or of the load above) visible before they are read.
    for (uint s = gl_WorkGroupSize.x / 2; s > 0; s /= 2)
    {
        barrier();
        if (tid < s)
        {
            localBuffer[tid] += localBuffer[tid + s];
        }
    }

    // Invocation 0 performed the final addition itself, so no further
    // barrier is needed before it reads localBuffer[0].
    if (tid == 0)
    {
        g_v[gid] = localBuffer[0];
    }
}
|
||||
|
||||
@@ -9,16 +9,48 @@ in uint gl_LocalInvocationIndex;
|
||||
*/
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(push_constant) uniform PushStruct {
|
||||
uint size;
|
||||
layout(push_constant) uniform PushStruct
|
||||
{
|
||||
uint offset;
|
||||
} p;
|
||||
}
|
||||
p;
|
||||
|
||||
layout(binding = 0) buffer inBuffer { uint v[]; };
|
||||
layout(binding = 1) buffer outBuffer { uint g_v[]; };
|
||||
layout(binding = 0) buffer inBuffer
|
||||
{
|
||||
uint v[];
|
||||
};
|
||||
layout(binding = 1) buffer outBuffer
|
||||
{
|
||||
uint g_v[];
|
||||
};
|
||||
|
||||
// TODO: Shared variables
|
||||
const uint bufferSize = 256;
|
||||
shared uint localBuffer;
|
||||
|
||||
void main() {
|
||||
// TODO: Kernel implementation
|
||||
}
|
||||
// Work-group sum reduction using a single shared accumulator.
//
// Every invocation loads up to two elements of v[], adds them, and
// accumulates the result into the shared counter with atomicAdd. Invocation 0
// then writes the total to g_v[gl_WorkGroupID.x].
//
// NOTE(review): p.offset is used as the exclusive upper bound on valid input
// indices; elements at or beyond it contribute 0. Confirm against the host
// code that this is the element count.
void main()
{
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_WorkGroupID.x;

    // A work group reads indices [offset, offset + 2 * gl_WorkGroupSize.x),
    // so consecutive tiles must start that many elements apart. Using a fixed
    // 256 would overlap or skip input unless the group size is exactly 128.
    uint offset = gid * 2u * gl_WorkGroupSize.x;

    uint idx1 = offset + tid;
    uint idx2 = idx1 + gl_WorkGroupSize.x;

    uint val1 = 0u;
    uint val2 = 0u;

    if (idx1 < p.offset)
        val1 = v[idx1];
    if (idx2 < p.offset)
        val2 = v[idx2];

    // One invocation clears the accumulator; the barrier keeps every other
    // invocation from adding to it before it has been cleared.
    if (tid == 0)
        localBuffer = 0u;
    barrier();

    uint partial = val1 + val2;
    atomicAdd(localBuffer, partial);

    // Wait until all invocations have contributed before publishing the sum.
    barrier();
    if (tid == 0)
        g_v[gid] = localBuffer;
}
|
||||
|
||||
@@ -9,16 +9,106 @@ in uint gl_LocalInvocationIndex;
|
||||
*/
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(push_constant) uniform PushStruct {
|
||||
uint size;
|
||||
layout(push_constant) uniform PushStruct
|
||||
{
|
||||
uint offset;
|
||||
} p;
|
||||
}
|
||||
p;
|
||||
|
||||
layout(binding = 0) buffer inBuffer { uint v[]; };
|
||||
layout(binding = 1) buffer outBuffer { uint g_v[]; };
|
||||
layout(binding = 0) buffer inBuffer
|
||||
{
|
||||
uint v[];
|
||||
};
|
||||
layout(binding = 1) buffer outBuffer
|
||||
{
|
||||
uint g_v[];
|
||||
};
|
||||
|
||||
// TODO: Shared variables
|
||||
// 512 Elements but initial reduction is done
|
||||
const uint bufferSize = 256;
|
||||
shared uint[bufferSize] localBuffer;
|
||||
|
||||
void main() {
|
||||
// TODO: Kernel implementation
|
||||
}
|
||||
// Work-group sum reduction in shared memory with an unrolled tail.
//
// Every invocation loads up to two elements of v[], adds them, and stores the
// partial sum in localBuffer[tid]. A loop folds the buffer in halves while
// the stride is larger than 32; the remaining steps (32, 16, 8, 4, 2, 1) are
// written out explicitly. Invocation 0 writes the final sum to
// g_v[gl_WorkGroupID.x].
//
// NOTE(review): p.offset is used as the exclusive upper bound on valid input
// indices; elements at or beyond it contribute 0. Confirm against the host
// code that this is the element count.
void main()
{
    uint tid = gl_LocalInvocationID.x;
    uint gid = gl_WorkGroupID.x;

    // A work group reads indices [offset, offset + 2 * gl_WorkGroupSize.x),
    // so consecutive tiles must start that many elements apart. Using a fixed
    // 256 would overlap or skip input unless the group size is exactly 128.
    uint offset = gid * 2u * gl_WorkGroupSize.x;

    uint idx1 = offset + tid;
    uint idx2 = idx1 + gl_WorkGroupSize.x;

    uint val1 = 0u;
    uint val2 = 0u;

    if (idx1 < p.offset)
        val1 = v[idx1];
    if (idx2 < p.offset)
        val2 = v[idx2];

    // First reduction step happens while loading.
    localBuffer[tid] = val1 + val2;
    barrier();

    // Reduction in shared memory (last steps are unrolled below).
    for (uint s = gl_WorkGroupSize.x / 2; s > 32; s >>= 1)
    {
        if (tid < s)
            localBuffer[tid] += localBuffer[tid + s];
        barrier();
    }

    // Unrolled tail. Full barriers are kept between steps, but each one sits
    // outside the `tid < N` branch: barrier() must be reached by every
    // invocation of the work group, and the outer gl_WorkGroupSize tests are
    // the same for all invocations.
    if (gl_WorkGroupSize.x >= 64)
    {
        if (tid < 32)
            localBuffer[tid] += localBuffer[tid + 32];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 32)
    {
        if (tid < 16)
            localBuffer[tid] += localBuffer[tid + 16];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 16)
    {
        if (tid < 8)
            localBuffer[tid] += localBuffer[tid + 8];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 8)
    {
        if (tid < 4)
            localBuffer[tid] += localBuffer[tid + 4];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 4)
    {
        if (tid < 2)
            localBuffer[tid] += localBuffer[tid + 2];
        barrier();
    }
    if (gl_WorkGroupSize.x >= 2)
    {
        // Final step: invocation 0 reads back its own write below, so no
        // barrier is required after it.
        if (tid < 1)
            localBuffer[tid] += localBuffer[tid + 1];
    }

    if (tid == 0)
    {
        g_v[gid] = localBuffer[0];
    }
}
|
||||
|
||||
@@ -8,28 +8,30 @@ in uvec3 gl_GlobalInvocationID;
|
||||
in uint gl_LocalInvocationIndex;
|
||||
*/
|
||||
|
||||
// Why did we not have conflicts in the Reduction?
|
||||
// Why did we not have conflicts in the Reduction?
|
||||
// Because of the sequential addressing (here we use interleaved => we have conflicts).
|
||||
// TODO: tailor to your architecture (these parameters work for virtually all NVIDIA GPUs)
|
||||
#define NUM_BANKS 32
|
||||
#define NUM_BANKS_LOG 5
|
||||
#define SIMD_GROUP_SIZE 32
|
||||
#define NUM_BANKS 32
|
||||
#define NUM_BANKS_LOG 5
|
||||
#define SIMD_GROUP_SIZE 32
|
||||
#define BUFFER_SIZE 256
|
||||
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
|
||||
layout(push_constant) uniform PushStruct {
|
||||
layout(push_constant) uniform PushStruct
|
||||
{
|
||||
uint size;
|
||||
} p;
|
||||
}
|
||||
p;
|
||||
|
||||
layout(binding = 0) buffer inoutBufer {uint array[];};
|
||||
layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
|
||||
layout(binding = 0) buffer inoutBufer { uint array[]; };
|
||||
layout(binding = 1) buffer offsetBuffer { uint g_v[]; };
|
||||
|
||||
// TODO: Shared variables
|
||||
shared uint temp[BUFFER_SIZE + (BUFFER_SIZE >> NUM_BANKS_LOG)];
|
||||
|
||||
// Bank conflicts
|
||||
#define AVOID_BANK_CONFLICTS
|
||||
#ifdef AVOID_BANK_CONFLICTS
|
||||
// TODO: define your conflict-free macro here
|
||||
#define OFFSET(A) ((A) + ((A) >> NUM_BANKS_LOG))
|
||||
#else
|
||||
#define OFFSET(A) (A)
|
||||
#endif
|
||||
@@ -37,16 +39,54 @@ layout(binding = 1) buffer offsetBufer {uint higherLevelArray[];};
|
||||
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
// Per-work-group inclusive prefix sum over `array`, computed in shared memory
// with an up-sweep followed by a down-sweep.
//
// Each invocation owns two consecutive elements. The total of the tile is
// stored in g_v[gl_WorkGroupID.x] before the down-sweep starts. Shared-memory
// indices go through OFFSET() so that padding can be inserted when
// AVOID_BANK_CONFLICTS is defined.
//
// NOTE(review): the sweeps operate on BUFFER_SIZE slots and every invocation
// writes slots 2*tid and 2*tid+1, so this presumably expects a work-group
// size of BUFFER_SIZE / 2 - confirm against the host-side specialization.
void main()
{
    const uint lane = gl_LocalInvocationID.x;
    const uint globalLane = gl_GlobalInvocationID.x;
    const uint n = BUFFER_SIZE;

    // Global indices of the two elements handled by this invocation.
    const uint evenIdx = 2 * globalLane;
    const uint oddIdx = 2 * globalLane + 1;

    // Out-of-range elements are treated as 0.
    uint first = 0;
    uint second = 0;
    if (evenIdx < p.size)
        first = array[evenIdx];
    if (oddIdx < p.size)
        second = array[oddIdx];

    // Stage both elements in shared memory.
    const uint slotEven = OFFSET(2 * lane);
    const uint slotOdd = OFFSET(2 * lane + 1);
    temp[slotEven] = first;
    temp[slotOdd] = second;

    // Up-sweep: build partial sums in place, doubling the stride each pass.
    for (uint d = 1; d < n; d <<= 1)
    {
        barrier();
        uint hi = (lane + 1) * d * 2 - 1;
        if (hi < n)
        {
            temp[OFFSET(hi)] += temp[OFFSET(hi - d)];
        }
    }

    // The last slot now holds the tile total. It was written by invocation 0
    // in the final up-sweep pass, so invocation 0 can read it without another
    // barrier. Publish it, then clear the slot for the down-sweep.
    if (lane == 0)
    {
        const uint last = OFFSET(n - 1);
        g_v[gl_WorkGroupID.x] = temp[last];
        temp[last] = 0;
    }

    // Down-sweep: push prefixes back down, halving the stride each pass.
    for (uint d = n >> 1; d > 0; d >>= 1)
    {
        barrier();
        uint hi = (lane + 1) * d * 2 - 1;
        if (hi < n)
        {
            uint lo = hi - d;
            uint left = temp[OFFSET(lo)];
            temp[OFFSET(lo)] = temp[OFFSET(hi)];
            temp[OFFSET(hi)] += left;
        }
    }

    // The final pass (stride 1) touches exactly this invocation's two slots,
    // so they can be read back directly. Adding the original value turns the
    // exclusive result of the down-sweep into an inclusive one.
    if (evenIdx < p.size)
        array[evenIdx] = temp[slotEven] + first;
    if (oddIdx < p.size)
        array[oddIdx] = temp[slotOdd] + second;
}
|
||||
|
||||
@@ -8,18 +8,27 @@ in uvec3 gl_GlobalInvocationID;
|
||||
in uint gl_LocalInvocationIndex;
|
||||
*/
|
||||
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
||||
layout (constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
|
||||
layout(constant_id = 1) const uint SAMPLE_MULTIPLIER = 1;
|
||||
|
||||
// Push constant
|
||||
layout(push_constant) uniform PushStruct {
|
||||
layout(push_constant) uniform PushStruct
|
||||
{
|
||||
uint size;
|
||||
} p;
|
||||
}
|
||||
p;
|
||||
|
||||
layout(binding = 0) buffer inoutBufer { uint v[]; };
|
||||
layout(binding = 1) buffer offsetBufer { uint g_v[]; };
|
||||
layout(binding = 0) buffer inoutBufer { uint data[]; };
|
||||
layout(binding = 1) buffer offsetBufer { uint offsets[]; };
|
||||
|
||||
// TODO: Shared variables
|
||||
// Adds the offset of all preceding tiles to every element of this tile.
//
// Each invocation updates two consecutive elements of `data`. The value added
// is offsets[group_id - 1].
//
// NOTE(review): this presumably expects `offsets` to hold the inclusive scan
// of the per-tile totals, and the tile width of 256 to match the scan kernel
// that produced `data` - confirm against the host code.
// NOTE(review): there is no bounds check on gid0 / gid1; this relies on the
// buffer covering a whole number of tiles - confirm, or guard with p.size.
void main()
{
    uint tid = gl_LocalInvocationID.x;
    uint group_id = gl_WorkGroupID.x;

    // The first tile has no predecessor. Without this guard group_id - 1
    // wraps around to 0xFFFFFFFF and offsets[] is read far out of bounds.
    if (group_id == 0u)
        return;

    uint gid0 = group_id * 256 + 2 * tid;
    uint gid1 = gid0 + 1;

    uint offset = offsets[group_id - 1u];
    data[gid0] += offset;
    data[gid1] += offset;
}
|
||||
|
||||
@@ -19,4 +19,14 @@ layout(binding = 0) buffer inBuffer { uint v[]; };
|
||||
layout(binding = 1) buffer outBufer { uint g_v[]; };
|
||||
|
||||
// One step of a naive (step-doubling) scan.
//
// For every index below p.size, writes to g_v the input element plus the
// element p.offset positions before it; indices smaller than p.offset are
// copied unchanged. Invocations at or beyond p.size do nothing.
void main()
{
    uint gid = gl_GlobalInvocationID.x;

    if (gid < p.size)
    {
        uint acc = v[gid];

        // Only elements that have a partner p.offset positions back get it
        // added.
        if (gid >= p.offset)
        {
            acc += v[gid - p.offset];
        }

        g_v[gid] = acc;
    }
}
|
||||
Reference in New Issue
Block a user