I'm new to CUDA and I'm trying to develop simple (naive) erosion algorithm with structural element 3x3. As for now, I have developed a code (it's based on nVidia presentation):
#define bx (blockIdx.x)
#define by (blockIdx.y)
#define bdx (blockDim.x)
#define bdy (blockDim.y)
#define tx (threadIdx.x)
#define ty (threadIdx.y)
#define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#define TILE_H 16
#define TILE_W 16
#define D 3 //structural element diameter
#define R 1 //structural element radius
#define BLOCK_W (TILE_W+D-1)
#define BLOCK_H (TILE_H+D-1)
__global__ void erosion(int *picture, unsigned int width, unsigned int height)
{
__shared__ int pixels[BLOCK_W*BLOCK_H];
int x = bx*TILE_W + tx - R;
int y = by*TILE_H + ty - R;
x = max(0, x);
x = min(x, (int)width-1);
y = max(y,0);
y = min(y, (int)height-1);
unsigned int idx = y*width + x;
unsigned int bidx = ty*bdy+tx;
pixels[bidx] = picture[idx];
__syncthreads();
//compute pixels inside apron
if (tx>=R && tx<BLOCK_W-R && ty>=R && ty < BLOCK_H-R)
{
//erode
if (pixels[bidx] == 1)
picture[idx] = pixels[ty*bdy+(tx+1)] & pixels[ty*bdy+(tx-1)] & pixels[(ty+1)*bdy+tx] & pixels[(ty-1)*bdy+tx];
}
}
And main()
function:
int main()
{
//...
int *pixels;
int img_width=M; int img_height=N;
cudaMemcpy(dev_pixels, pixels, M*N*sizeof(int), cudaMemcpyHostToDevice);
dim3 blocks(img_width/BLOCK_W, img_height/BLOCK_H);
erosion<<<blocks, D*D>>>(dev_pixels, img_width, img_height);
cudaMemcpy(output, dev_pixels, M*N*sizeof(int), cudaMemcpyDeviceToHost);
}
My problem is: it seems, that erosion()
never reaches the if
statement, where I want to compute pixels inside the apron. Do You happen to have any idea why is that so? I already ruled out img_widht/BLOCK_W
division (it could return 0 value, but currently I fixed img_widht=54
and img_height=36
).