So I've gone over this multiple times and just cant seem to figure it out. What is happening is my variable that I am trying to copy from GPU memory to CPU memory always appears to be blank.
From my understanding, I should have a variable or multiple variables and create copies of these which I will send to the GPU along with some data to compute, once it is done computing, come back and insert the contents of the variable from GPU into the one from CPU.
But every time I do this, my variable 'd_result' is always empty. If anyone has an idea on how to fix this, it would be much appreciated.
My CUDA function:
__global__ void gpu_histogram_equalization(unsigned char * img_out, unsigned char * img_in,
int * hist_in, int img_size, int nbr_bin){
int *lut = (int *)malloc(sizeof(int)*nbr_bin);
int i, cdf, min, d;
/* Construct the LUT by calculating the CDF */
cdf = 0;
min = 0;
i = threadIdx.x;
while(min == 0){
min = hist_in[i++];
}
d = img_size - min;
if(i < nbr_bin){
cdf += hist_in[i];
//lut[i] = (cdf - min)*(nbr_bin - 1)/d;
lut[i] = (int)(((float)cdf - min)*255/d + 0.5);
if(lut[i] < 0){
lut[i] = 0;
}
}
/* Get the result image */
if(i < img_size){
if(lut[img_in[i]] > 255){
img_out[i] = 255;
}
else{
img_out[i] = (unsigned char)lut[img_in[i]];
}
}
}
And then my function which calls it:
PGM_IMG gpu_contrast_enhancement_g(PGM_IMG img_in)
{
PGM_IMG result;
int hist[256];
unsigned char * d_result;
result.w = img_in.w;
result.h = img_in.h;
result.img = (unsigned char *)malloc(result.w * result.h * sizeof(unsigned char));
cudaMalloc(&d_result, result.w * result.h * sizeof(unsigned char));
cudaMemcpy(d_result, result.img, result.w * result.h * sizeof(unsigned char), cudaMemcpyHostToDevice);
histogram(hist, img_in.img, img_in.h * img_in.w, 256);
gpu_histogram_equalization<<<1,result.w * result.h * sizeof(unsigned char)>>>(d_result,img_in.img,hist,result.w*result.h, 256);
cudaMemcpy(result.img, d_result, result.w * result.h * sizeof(unsigned char), cudaMemcpyDeviceToHost);
cudaFree(d_result);
return result;
}