I have a CUDA function that calculates Local Binary Patterns on GPU. Basically LBP is a computation over the pixels of an image where the value of any given pixel (i,j) depends on it's 8 neighbors' intensities.
So far so good, the code is the following:
//The kernel
__global__ void LBP(unsigned char *in, unsigned char *out, const int w, const int h)
{
const unsigned int i = (blockIdx.x * blockDim.x) + threadIdx.x;
//Don't do edges!
if(
i < w //first row
|| i >= (w * (h - 1)) // last row
|| !(i % w) // first column
|| (i % w + 1 == w) // last column
)
{
out[i] = 0;
return;
}
unsigned char
code = 0,
center = in[i];
code |= (in[i-w-1] > center) << 7;
code |= (in[i-w ] > center) << 6;
code |= (in[i-w+1] > center) << 5;
code |= (in[i +1] > center) << 4;
code |= (in[i+w+1] > center) << 3;
code |= (in[i+w ] > center) << 2;
code |= (in[i+w-1] > center) << 1;
code |= (in[i -1] > center) << 0;
out[i] = code;
}
// A proxi function
void DoLBP(unsigned char *in, unsigned char *out, const int w, const int h)
{
const int
sz = w * h * sizeof(unsigned char);
unsigned char
*in_gpu,
*out_gpu;
cudaMalloc((void**)&in_gpu, sz);
cudaMalloc((void**)&out_gpu, sz);
cudaMemcpy(in_gpu, in, sz, cudaMemcpyHostToDevice);
cudaMemcpy(out_gpu, out, sz, cudaMemcpyHostToDevice);
dim3 threadsPerBlock(1024); //Max
dim3 numBlocks(w*h/threadsPerBlock.x + 1);
LBP<<<numBlocks,threadsPerBlock>>>(in_gpu, out_gpu, w, h);
cudaMemcpy(out, out_gpu, sz, cudaMemcpyDeviceToHost);
cudaFree(in_gpu);
cudaFree(out_gpu);
}
//The caller
int main()
{
printf("Starting\n");
const int
w = 4000,
h = 2000;
unsigned char
in[w*h],
out[w*h];
// Fill [in] with some data
DoLBP(in, out, w, h);
// Use [out] data
return 0;
}
The images are passed to the GPU as a single-dimension array of *unsigned char*s (array = [[row 1] [row 2] [row 3] ... [row n]]
) (they are extracted from OpenCV's Mat)
The problem
This code works fine with relatively small images and it returns the output array populated with the right values but when the image size grows, the output array is all zeroed!
My suspicion is that the image data is overflowing some GPU buffer or something like that.
It is also not clear to me how does the numberOfBlocks and threadsPerBlock part work! If any of you could provide some basic insight about this it would be much appreciated.
(I'm like 1-day-old in CUDA, so there might be way too many ways to improve this snippet of code!)