
Here is my almost complete code. The first kernel, the plain global-memory histogram, works correctly, but I get the error "an illegal memory access was encountered (77)" at the final memcpy after running the shared_histog kernel. I don't know what is wrong with the code; it seems like the shared-memory kernel never changes the contents of d_hist2. I also checked whether bin_count was being changed, but it wasn't. So is my shared_histog kernel wrong, or am I making a mistake in the memcpy? Note: w * h * nc is the size of my input image.

__global__ void histog( int *img, int *hist, int bin_count, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    if(x>=n)   return;

    unsigned char value = img[x];
    int bin = value % bin_count;
    atomicAdd(&hist[bin], 1);
}

__global__ void shared_histog( int *img, int *hist, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int indx = threadIdx.x;
    if(x>n)   return;

    __shared__ int shHist[256];

    if (indx < 256)
        shHist[indx] = 0;
    __syncthreads();

    unsigned char value = img[x];
    __syncthreads();

    atomicAdd( (int*)&shHist[value], 1);
    __syncthreads();

    atomicAdd( (int*)&(hist[indx]), shHist[indx] );
}



int main(int argc, char **argv)
{
    cudaDeviceSynchronize();  CUDA_CHECK;

    int *imgval = new int[(size_t)w*h*nc];
    for (int i = 0; i < w*h*nc; i++)
        imgval[i] = (imgIn[i])*256 + 1;

    int bin_count = 256;
    int *Histogram  = new int[bin_count];
    int *Histogram2 = new int[bin_count];

    for (int i = 0; i < bin_count; i++)
        Histogram2[i] = 0;

    Timer timer; timer.start();
    for (int i = 0; i < bin_count; i++)
        Histogram[i] = 0;
    for (int i = 0; i < w*h*nc; i++)
        Histogram[(imgval[i])]++;

    showHistogram256("CPU_Histo", Histogram, 100 + w + 40, 100);

    timer.end();  float t = timer.get();  // elapsed time in seconds
    cout << "CPU time: " << t*1000 << " ms" << endl;

    int *d_img = NULL;
    int nbytes = w * h * nc * sizeof(int);
    cudaMalloc(&d_img, nbytes); CUDA_CHECK;
    cudaMemcpy(d_img, imgval, nbytes, cudaMemcpyHostToDevice); CUDA_CHECK;

    int *d_hist = NULL;
    cudaMalloc(&d_hist, bin_count * sizeof(int)); CUDA_CHECK;
    cudaMemset(d_hist, 0, bin_count * sizeof(int)); CUDA_CHECK;

    int *d_hist2 = NULL;
    cudaMalloc(&d_hist2, bin_count * sizeof(int)); CUDA_CHECK;
    cudaMemset(d_hist2, 0, bin_count * sizeof(int)); CUDA_CHECK;

    dim3 block = dim3(1024,1,1);
    dim3 grid = dim3((w*h*nc + block.x - 1)/block.x, 1, 1);

    Timer timer2; timer2.start();
    histog <<<grid, block>>> (d_img, d_hist, bin_count, nbytes);    CUDA_CHECK;
    timer2.end();  float t2 = timer2.get();  // elapsed time in seconds
    cout << "GPU time: " << t2*1000 << " ms" << endl;
    cudaMemcpy(Histogram, d_hist, bin_count * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
    showHistogram256("GPU_Histo", Histogram, 100 + w + 40, 100 + h/2 + 10);

    Timer timer3; timer3.start();
    shared_histog <<<grid, block>>> (d_img, d_hist2, nbytes); CUDA_CHECK;
    timer3.end();  float t3 = timer3.get();  // elapsed time in seconds
    cout << "Shared time: " << t3*1000 << " ms" << endl;

    // *** here comes the error ***
    cudaMemcpy(Histogram2, d_hist2, 256 * sizeof(int), cudaMemcpyDeviceToHost);  CUDA_CHECK;

    showHistogram256("GPU_Histo_Shared", Histogram2, 100 + w + 40, 100 + h + 10);

    return 0;
}
Afshinzkh

1 Answer


You're using __syncthreads() after a conditional statement:

if(x>n)   return;

which may prevent some threads in the block from reaching it. That is not correct usage:

__syncthreads() is allowed in conditional code but only if the conditional evaluates identically across the entire thread block, otherwise the code execution is likely to hang or produce unintended side effects.
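
A safer pattern is to drop the early return and guard only the per-thread work, so that every thread in the block still reaches each barrier. A minimal sketch, using this kernel's own loading step:

if (x < n) {
    unsigned char value = img[x];
    atomicAdd(&shHist[value], 1);
}
__syncthreads();   // reached unconditionally by every thread in the block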

But this __syncthreads() issue is probably not connected to the illegal memory access.

You are launching this kernel with 1024 threads per block:

dim3 block = dim3(1024,1,1);

which means in the kernel, your indx variable:

int indx = threadIdx.x;

will range from 0 to 1023 across the threads of the block, which means that this line:

atomicAdd( (int*)&(hist[indx]), shHist[indx] );
                        ^^^^           ^^^^

will attempt to index into both hist and shHist out of bounds for threads whose indx value is greater than 255, since both hist and shHist are allocated with only 256 elements.

You can probably fix this by adding a conditional statement:

if (indx < 256) 
  atomicAdd( (int*)&(hist[indx]), shHist[indx] );
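
Putting that bounds check together with the barrier-safe pattern above, the whole kernel could look like the following sketch (assuming 256 bins, a block size of at least 256 threads, and that n is the number of int elements in img):

__global__ void shared_histog( int *img, int *hist, int n)
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int indx = threadIdx.x;

    __shared__ int shHist[256];

    // zero the block-local histogram
    if (indx < 256)
        shHist[indx] = 0;
    __syncthreads();

    // only in-range threads contribute, but every thread reaches the barrier
    if (x < n) {
        unsigned char value = img[x];
        atomicAdd(&shHist[value], 1);
    }
    __syncthreads();

    // the first 256 threads fold the block-local counts into the global histogram
    if (indx < 256)
        atomicAdd(&hist[indx], shHist[indx]);
}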

If you compile with -lineinfo and use cuda-memcheck, you can actually have cuda-memcheck pinpoint the line of source code that is generating the out-of-bounds access.
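
For example (assuming the source file is named histo.cu):

nvcc -lineinfo -o histo histo.cu
cuda-memcheck ./histo

cuda-memcheck will then report the file and line of the failing access along with the thread and block coordinates.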

Robert Crovella