I am new to C++/CUDA. I tried implementing the parallel algorithm "reduce" with ability to handle any type of inputsize, and threadsize without increasing asymptotic parallel runtime by recursing over the output of the kernel (in the kernel wrapper).
e.g. Implementing Max Reduce in Cuda the top answer to this question, his/hers implementation will essentially be sequential when threadsize is small enough.
However, I keep getting a "Segmentation fault" when I compile and run it ..?
>> nvcc -o mycode mycode.cu
>> ./mycode
Segmentail fault.
Compiled on a K40 with cuda 6.5
Here is the kernel, basically same as the SO post I linked the checker for "out of bounds" is different:
#include <stdio.h>
/* -------- KERNEL -------- */
__global__ void reduce_kernel(float * d_out, float * d_in, const int size)
{
// position and threadId
int pos = blockIdx.x * blockDim.x + threadIdx.x;
int tid = threadIdx.x;
// do reduction in global memory
for (unsigned int s = blockDim.x / 2; s>0; s>>=1)
{
if (tid < s)
{
if (pos+s < size) // Handling out of bounds
{
d_in[pos] = d_in[pos] + d_in[pos+s];
}
}
}
// only thread 0 writes result, as thread
if (tid==0)
{
d_out[blockIdx.x] = d_in[pos];
}
}
The kernel wrapper I mentioned to handle when 1 block will not contain all of the data.
/* -------- KERNEL WRAPPER -------- */
void reduce(float * d_out, float * d_in, const int size, int num_threads)
{
// setting up blocks and intermediate result holder
int num_blocks = ((size) / num_threads) + 1;
float * d_intermediate;
cudaMalloc(&d_intermediate, sizeof(float)*num_blocks);
// recursively solving, will run approximately log base num_threads times.
do
{
reduce_kernel<<<num_blocks, num_threads>>>(d_intermediate, d_in, size);
// updating input to intermediate
cudaMemcpy(d_in, d_intermediate, sizeof(float)*num_blocks, cudaMemcpyDeviceToDevice);
// Updating num_blocks to reflect how many blocks we now want to compute on
num_blocks = num_blocks / num_threads + 1;
// updating intermediate
cudaMalloc(&d_intermediate, sizeof(float)*num_blocks);
}
while(num_blocks > num_threads); // if it is too small, compute rest.
// computing rest
reduce_kernel<<<1, num_blocks>>>(d_out, d_in, size);
}
Main program to initialize in/out and create bogus data for testing.
/* -------- MAIN -------- */
int main(int argc, char **argv)
{
// Setting num_threads
int num_threads = 512;
// Making bogus data and setting it on the GPU
const int size = 1024;
const int size_out = 1;
float * d_in;
float * d_out;
cudaMalloc(&d_in, sizeof(float)*size);
cudaMalloc((void**)&d_out, sizeof(float)*size_out);
const int value = 5;
cudaMemset(d_in, value, sizeof(float)*size);
// Running kernel wrapper
reduce(d_out, d_in, size, num_threads);
printf("sum is element is: %.f", d_out[0]);
}