I lost data after __syncthreads() in cuda

Question

I am trying to find the maximum of an array.. I took the help from CUDA Maximum Reduction Algorithm Not Working. and do some own modification. However I am running it for 16 data. I am finding that in kernel code shared memory copies only 1st 4data. rest are lost. I put two cuPrintf..1st printf shows data is their in the shared memory. But the 2nd cuPrintf is just after __syncthreads.. and that shows 0 from thread ids 4 onwords.. pls help #include #include #include #include #include #include "cuPrintf.cu" #include "cuPrintf.cuh"

__device__ float MaxOf2(float a, float b)
{
    if(a > b)   return a;
    else            return b;
}

__global__ void findMax(int size,float *array_device , float *outPut)
{
    extern __shared__ float sdata[];
    int tid =  threadIdx.x;
    int i = blockIdx.x*blockDim.x + threadIdx.x;

   if(i< size)
   {
    sdata[tid] = array_device[i];
    cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);
    __threadfence();

    }
    __syncthreads();

    if(tid<size) 
        cuPrintf(" array_d[%d]===%f, sdata[%d]===%f\n ",i,array_device[i],tid,sdata[tid]);

    for ( int s=blockDim.x/2; s>0; s=s>>1)//s=blockDim.x/2
    {   
        if (tid < s) 
        {   
            sdata[tid]= MaxOf2(sdata[tid],sdata[tid+s]);

        }

        __syncthreads();
   }
   if (tid == 0) outPut[blockIdx.x] = sdata[0];

}

 int main()
{
    long double M = pow(2,20);
    long double N = 2;
    int noThreadsPerBlock = 512 ;
    printf("\n Provide the array Size N.(array will be of size N * 2^20 ) :-");
    scanf("%Lf",&N);
    long int size = 16;
    int numOfBlock = (int)size /noThreadsPerBlock + 1;
    printf("\n num of blocks==%ld",numOfBlock);

    float *array_device , *outPut;
    float array_host[]={221,100,2,340,47,36,500,1,33,4460,5,6,7,8,9,11};
    cudaMalloc((void **)&array_device, size*sizeof(float));
    cudaMalloc((void **)&outPut, size*sizeof(float));
    cudaError_t error0 = cudaGetLastError();
    printf("\n 0CUDA error: %s\n", cudaGetErrorString(error0));
    printf("size===%ld",size);

    cudaMemcpy(array_device, array_host, size*sizeof(float), cudaMemcpyHostToDevice);
    cudaError_t error1 = cudaGetLastError();
    printf("\n1CUDA error: %s\n", cudaGetErrorString(error1));


    while(size>1 )
    {
        cudaPrintfInit();
        findMax<<< numOfBlock,noThreadsPerBlock>>>(size,array_device, outPut);cudaPrintfDisplay(stdout, true);
       cudaPrintfEnd();
       cudaError_t error2 = cudaGetLastError();
       printf("   2CUDA error: %s\n", cudaGetErrorString(error2));
       cudaMemcpy(array_device, outPut, size*sizeof(float), cudaMemcpyDeviceToDevice);
       size = numOfBlock;
       printf("\n ****size==%ld\n",size);
       numOfBlock = (int)size /noThreadsPerBlock + 1;
   }

     cudaMemcpy(array_host, outPut, size*sizeof(float), cudaMemcpyDeviceToHost);
     cudaError_t error3 = cudaGetLastError();
    printf("\n3CUDA error: %s\n", cudaGetErrorString(error3));
    for(int i=0;i<size;i++)
          printf("\n index==%d ;data=%f ",i,array_host[i]);
    return 0;
 }

Firstly, you havent specified dynamic size of shared memory in kernel launch. It should look like: `findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>` Secondly, what was the concept behind condition `if(tid — Maku, Aug 28 '13 at 08:03
Is it really necessary to post lots of lines of commented out code? You are asking us to read and correct your code, at least take the time to format it correctly so that it is easy for us to read. Help us help you..... — talonmies, Aug 28 '13 at 08:03
I put that if condition just to prohibit so many print as I have only 16 data. I tried to put the output..but geting some problem to to save it as it is giving "some code is there...". — Abhidip Bhattacharyya, Aug 28 '13 at 08:47
@Maku could you post your response as an answer. Apparently the poster is going to let this question sit and collect dust. The code as posted clearly cannot work for the (first) reason you state. I would upvote your answer. — Robert Crovella, Aug 29 '13 at 14:56

score 3 · Answer 1 · answered Aug 30 '13 at 12:50

3

I'm posting my comment as an answer as requested.

Firstly, you havent specified dynamic size of shared memory in kernel launch. It should look something like: findMax<<< numOfBlock,noThreadsPerBlock,sizeof(float)*noThreadsPerBlock>>>

Secondly, what was the concept behind condition if(tid<size) on second cuPrintf? Providing output of the program could also help.

answered Aug 30 '13 at 12:50

Maku

1,464
11
20

Thanks for the answer. It helps us get these abandoned questions off the unaswered list. – talonmies Aug 30 '13 at 13:18

I lost data after __syncthreads() in cuda

1 Answers1