I wrote a simple code to understand Dynamic Parallelism. From the values being printed,I see that the child kernel has executed correctly, but when I come back to the parent kernel, I see wrong values being used in place of temp array which is being updated correctly in the child kernel. When I try to update the 'd_cin array' it is giving me wrong values. These are the compilation flags being used :
nvcc -m64 -dc -gencode arch=compute_35,code=sm_35 -I/opt/apps/cuda/5.5/include -I. -I.. -I../../common/inc -o simple.o -c simple.cu
nvcc -m64 -gencode arch=compute_35,code=sm_35 -o simple simple.o -L/opt/apps/cuda/5.5/lib64 -lcudadevrt
Can someone help me ? Here is the code.
#include <stdio.h>
#include "cuPrintf.cu"
#include "cuPrintf.cuh"
__global__ void innerKernel(double *I,double *d_temp,int parentIndex){
int index=threadIdx.x+blockIdx.x*blockDim.x;
d_temp[parentIndex*3+index]=I[parentIndex];
}
__global__ void kernel(double *d_I,double *d_temp,double *d_cin){
int index=threadIdx.x+blockIdx.x*blockDim.x;
int i;
double res=0.0;
if(index<30){
cudaStream_t s;
cudaStreamCreateWithFlags( &s, cudaStreamNonBlocking );
dim3 dimBlock(3,1,1);
dim3 dimGrid(1,1,1);
innerKernel<<<dimGrid,dimBlock>>>(d_I,d_temp,index);
__syncthreads();
if(index==0){
for(i=0;i<90;i++)
cuPrintf("temp[%d]: %f\n",i,d_temp[i]);
}
for (i=0;i<3;i++){
res=res+d_temp[index*3+i];
}
__syncthreads();
d_cin[index]=res;
cudaStreamDestroy(s);
}
}
int main(int argc,char **argv){
double I[30]={1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30};
double *d_I;
double *d_temp;
double *d_cin;
double cout[30];
cudaMalloc(&d_I,30*sizeof(double));
cudaMemcpy(d_I,I,30*sizeof(double),cudaMemcpyHostToDevice);
cudaMalloc(&d_temp,3*30*sizeof(double));
cudaMalloc(&d_cin,30*sizeof(double));
dim3 dimBlock(8,1,1);
dim3 dimGrid(4,1,1);
/*LAUNCH THE KERNEL*/
printf("Before the kernel\n");
cudaPrintfInit();
kernel<<<dimGrid,dimBlock>>>(d_I,d_temp,d_cin);
//cudaThreadSynchronize();
cudaPrintfDisplay(stdout,true);
cudaPrintfEnd();
printf("After the kernel\n");
cudaMemcpy(cout,d_cin,30*sizeof(double),cudaMemcpyDeviceToHost);
int i;
for(i=0;i<30;i++)
printf("%f\n",cout[i]);
}