In the following code, I want to use nested parallelism to compute 10 times an element of an array. I use this simple example to learn more about dynamic parallelism in Cuda. The way the code works is that for each element of a parentArray, there is another kernel saving this element at a position of a childArray (0 to 9). So for each element of the parentArray, I have another array with 10 elements, each being equal to the element of the parentArray. In the end, I compute the sum of all childArrays and save the result in the parentArray.
The result therefore should be:
Element 0 of parentArray, Result = 0
Element 1 of parentArray, Result = 10
Element 2 of parentArray, Result = 20 and so on
Currently, the code compiles but doesn't give the expected results. What is wrong with the current code?
The function to compute the sum of the elements
__device__ double summe(double *arr, int size)
{
double result = 0.0;
for(int i = 0; i < size; i++)
{
result += arr[i];
}
return result;
}
The function called from childKernel
__device__ double getElement(double arrElement)
{
return arrElement;
}
The array in which results are stored
__device__ double childArr[10];
The childKernel
__global__ void childKernel(double *arr, double arrElement,int N)
{
int cidx = blockIdx.x * blockDim.x + threadIdx.x;
if (cidx < N)
{
arr[cidx] = getElement(arrElement);
}
}
The parentKernel
__global__ void parentKernel(double *parentArray, int N)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < N)
{
childKernel<<<1,10>>>(childArr,parentArray[idx],N);
__syncthreads();
parentArray[idx] = summe(childArr,10);
}
}
The main part
int main(void)
{
double *host_array;
double *device_array;
// Number of elements in arrays
const int N_array = 10;
// size of array
const size_t size_array = N_array * sizeof(double);
// Allocate array on host
host_array = (double *)malloc(size_array);
// Allocate array on device
CUDA_CALL(cudaMalloc((void **) &device_array, size_array));
// Initialize host array
for (int i=0; i<N_array; i++)
{
host_array[i] = (double)i;
}
// and copy it to CUDA device
CUDA_CALL(cudaMemcpy(device_array, host_array, size_array, cudaMemcpyHostToDevice));
// Do calculation on device:
int block_size = 4;
// if N = 10, then n_blocks = 3
int n_blocks = N_array/block_size + (N_array % block_size == 0 ? 0:1);
parentKernel<<<n_blocks, block_size>>>(device_array,N_array);
// Retrieve result from device and store it in host array
CUDA_CALL(cudaMemcpy(host_array, device_array, sizeof(double)*N_array, cudaMemcpyDeviceToHost));
// Print results
for (int i=0; i<N_array; i++)
{
printf("Element %d of parentArray, Result = %f\n", i, host_array[i]);
}
// Cleanup
free(host_array);
CUDA_CALL(cudaFree(device_array));
}
The result I get is:
0 52.000000
1 52.000000
2 52.000000
3 52.000000
4 48.000000
5 48.000000
6 48.000000
7 48.000000
8 48.000000
9 48.000000
I use Cuda 6.5
NVCCFLAGS= -arch=sm_35 -rdc=true -G -O3 --compiler-options -Wall
/opt/cuda-6.5/bin/nvcc -V
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2014 NVIDIA Corporation
Built on Thu_Jul_17_21:41:27_CDT_2014
Cuda compilation tools, release 6.5, V6.5.12