So Here is My almost Complete code: the first kernel which is normal global histogram works correctly. but I get the error "an illegal memory access was encountered (77)" at the final memcpy after calculating the shared_histogram. I dont know what is wrong with the code. seems like the shared histogram does change the size of d_hist2. I also checked that bin_count is changed or not. but it didnt. so is my shared_histog kernel wrong or i am doing a mistake on memCpy?? note : w * h * nc is the size of my input image
__global__ void histog( int *img, int *hist, int bin_count, int n)
{
int x = threadIdx.x + blockDim.x *blockIdx.x;
if(x>=n) return;
unsigned char value = img[x];
int bin = value % bin_count;
atomicAdd(&hist[bin],1);
}
__global__ void shared_histog( int *img, int *hist, int n)
{
int x = threadIdx.x + blockDim.x *blockIdx.x;
int indx = threadIdx.x;
if(x>n) return;
__shared__ int shHist[256];
if (indx < 256)
shHist[indx] =0;
__syncthreads();
unsigned char value = img[x];
__syncthreads();
atomicAdd( (int*)&shHist[value], 1);
__syncthreads();
atomicAdd( (int*)&(hist[indx]), shHist[indx] );
}
int main(int argc, char **argv)
{
cudaDeviceSynchronize(); CUDA_CHECK;
int *imgval = new int[(size_t)w*h*nc];
for (int i =0; i<w*h*nc; i++)
imgval[i] = (imgIn[i])*256 + 1;
int bin_count = 256;
int *Histogram = new int[bin_count];
int *Histogram2 = new int[bin_count];
for (int i =0; i <bin_count; i++)
Histogram2[i] = 0;
Timer timer; timer.start();
for (int i =0; i <bin_count; i++)
Histogram[i] = 0;
for (int i =0; i<w*h*nc; i++)
Histogram[(imgval[i])]++;
showHistogram256("CPU_Histo", Histogram, 100 + w + 40, 100);
timer.end(); float t = timer.get(); // elapsed time in seconds
cout << "CPU time: " << t*1000 << " ms" << endl;
int *d_img = NULL;
int nbytes = w * h * nc * sizeof(int);
cudaMalloc(&d_img, nbytes); CUDA_CHECK;
cudaMemcpy(d_img, imgval, nbytes, cudaMemcpyHostToDevice); CUDA_CHECK;
int *d_hist = NULL;
cudaMalloc(&d_hist, bin_count * sizeof(int)); CUDA_CHECK;
cudaMemset(d_hist, 0, bin_count * sizeof(int)); CUDA_CHECK;
int *d_hist2 = NULL;
cudaMalloc(&d_hist2, bin_count * sizeof(int)); CUDA_CHECK;
cudaMemset(d_hist2, 0, bin_count * sizeof(int)); CUDA_CHECK;
dim3 block = dim3(1024,1,1);
dim3 grid = dim3 ((w*h*nc+block.x-1)/block.x, 1, 1);
Timer timer2; timer2.start();
histog <<<grid, block>>> (d_img, d_hist, bin_count, nbytes); CUDA_CHECK;
timer2.end(); float t2 = timer2.get(); // elapsed time in seconds
cout << "GPU time: " << t2*1000 << " ms" << endl;
cudaMemcpy(Histogram, d_hist,bin_count * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
showHistogram256("GPU_Histo", Histogram, 100 + w + 40, 100 + h/2 + 10);
Timer timer3; timer3.start();
shared_histog <<<grid, block>>> (d_img, d_hist2, nbytes); CUDA_CHECK;
timer3.end(); float t3 = timer3.get(); // elapsed time in seconds
cout << "Shared time: " << t3*1000 << " ms" << endl;
* here comes the error *
cudaMemcpy(Histogram2, d_hist2, 256 * sizeof(int), cudaMemcpyDeviceToHost); CUDA_CHECK;
showHistogram256("GPU_Histo_Shared", Histogram2, 100 + w + 40, 100 + h +10);
return 0;
}