1

I'm new to CUDA and I'm trying to implement a simple (naive) erosion algorithm with a 3x3 structuring element. So far I have written the following code (it is based on an NVIDIA presentation):

// Shorthand aliases for the CUDA built-in block/thread index and size variables.
#define bx (blockIdx.x)
#define by (blockIdx.y)
#define bdx (blockDim.x)
#define bdy (blockDim.y)
#define tx (threadIdx.x)
#define ty (threadIdx.y)
// Function-like max/min. Each argument may be evaluated twice, so do not pass
// expressions with side effects.
#define max( a, b ) ( ((a) > (b)) ? (a) : (b) )
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )

// Per-block step through the image, in pixels: the kernel places block (bx, by)
// at image offset (bx*TILE_W, by*TILE_H).
#define TILE_H 16
#define TILE_W 16
#define D 3    //structuring element diameter
#define R 1    //structuring element radius
// Tile extended by D-1 pixels in each dimension (R on every side for D=3, R=1);
// used to size the kernel's shared-memory buffer and to bound its interior test.
#define BLOCK_W (TILE_W+D-1)
#define BLOCK_H (TILE_H+D-1)

// Erodes a 0/1 image in place: every pixel equal to 1 is replaced by the
// bitwise AND of its left, right, upper and lower neighbours (4-neighbour
// cross). Neighbours that fall outside the image are taken from the nearest
// edge pixel (coordinates are clamped).
//
// picture : device pointer to width*height ints, row-major; read and written.
// width   : image width in pixels.
// height  : image height in pixels.
//
// Intended launch configuration:
//   block = dim3(BLOCK_W, BLOCK_H)  -- one thread per tile pixel plus an apron
//                                      of R pixels on every side
//   grid  = dim3(ceil(width / TILE_W), ceil(height / TILE_H))
// Uses BLOCK_W*BLOCK_H ints of static shared memory. Other block shapes are
// memory-safe (threads without a slot in the shared tile are ignored and only
// threads whose four neighbours were loaded produce output) but will not cover
// the whole image.
//
// NOTE(review): input and output share one buffer, so a block may load apron
// pixels that a neighbouring block has already eroded. A separate output
// buffer is needed for results that do not depend on block scheduling.
__global__ void erosion(int *picture, unsigned int width, unsigned int height)
{
    __shared__ int pixels[BLOCK_W*BLOCK_H];

    // Empty image: nothing to do, and the clamps below would otherwise yield -1.
    // The condition is uniform across the block, so no thread is left waiting
    // at the barrier.
    if (width == 0 || height == 0)
        return;

    // Image coordinates of this thread before clamping; apron threads of
    // border blocks land outside the image.
    const int gx = bx*TILE_W + tx - R;
    const int gy = by*TILE_H + ty - R;
    const bool in_image = gx >= 0 && gx < (int)width && gy >= 0 && gy < (int)height;

    // Clamp to the image so that apron threads replicate the nearest edge pixel.
    int x = max(0, gx);
    x = min(x, (int)width-1);
    int y = max(gy, 0);
    y = min(y, (int)height-1);

    const unsigned int idx = y*width + x;

    // Part of the shared tile this block can actually fill.
    const unsigned int cols = min(bdx, (unsigned int)BLOCK_W);
    const unsigned int rows = min(bdy, (unsigned int)BLOCK_H);

    // The shared tile has a fixed row stride of BLOCK_W, independent of the
    // launch shape (the original used blockDim.y, which only matches for an
    // 18x18 block).
    const unsigned int bidx = ty*BLOCK_W + tx;
    if (tx < cols && ty < rows)
        pixels[bidx] = picture[idx];
    __syncthreads();

    // Compute only pixels inside the apron, i.e. threads whose four neighbours
    // were loaded above.
    const bool interior = tx >= R && tx + R < cols && ty >= R && ty + R < rows;

    // Threads that were clamped onto an edge pixel must not write: they would
    // overwrite that pixel with a result computed from the wrong neighbourhood
    // and race with the thread that really owns it.
    if (interior && in_image && pixels[bidx] == 1)
    {
        // erode
        picture[idx] = pixels[bidx+1] & pixels[bidx-1] & pixels[bidx+BLOCK_W] & pixels[bidx-BLOCK_W];
    }
}

And main() function:

// Host driver: uploads the image, runs the erosion kernel on it in place and
// downloads the result. Returns 0 on success and 1 if any CUDA call fails.
int main()
{
    //...
    // NOTE(review): the declarations and allocations of dev_pixels and output,
    // and the allocation/initialisation of pixels, are not shown in this
    // listing -- confirm that all three are valid before the copies below.
    int *pixels;
    int img_width=M; int img_height=N;
    if (cudaMemcpy(dev_pixels, pixels, M*N*sizeof(int), cudaMemcpyHostToDevice) != cudaSuccess)
        return 1;

    // NOTE(review): the integer division truncates, and the kernel advances by
    // TILE_W/TILE_H (not BLOCK_W/BLOCK_H) per block, so this grid may not cover
    // the whole image.
    dim3 blocks(img_width/BLOCK_W, img_height/BLOCK_H);
    // NOTE(review): D*D is a plain integer, so each block is a 1D array of 9
    // threads and threadIdx.y is 0 in every thread. This launch configuration
    // is the subject of the question and is left unchanged here.
    erosion<<<blocks, D*D>>>(dev_pixels, img_width, img_height);
    // A kernel launch returns no status; an invalid configuration is reported
    // by cudaGetLastError().
    if (cudaGetLastError() != cudaSuccess)
        return 1;

    // The blocking copy also surfaces faults raised while the kernel was running.
    if (cudaMemcpy(output, dev_pixels, M*N*sizeof(int), cudaMemcpyDeviceToHost) != cudaSuccess)
        return 1;

    return 0;
}

My problem is that erosion() never seems to reach the body of the if statement where I want to compute the pixels inside the apron. Do you have any idea why that is? I have already ruled out the img_width/BLOCK_W division as the cause (it could evaluate to 0, but for now I have fixed img_width=54 and img_height=36).

Szał Pał
  • 306
  • 1
  • 7
  • 20
  • 1
    start by adding [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) to your code and run your code with `cuda-memcheck` – Robert Crovella May 21 '14 at 23:21

1 Answer

2

You are launching a kernel whose grid consists of a 2D array of blocks, each of which has a 1D array of threads:

dim3 blocks(img_width/BLOCK_W, img_height/BLOCK_H); // creates 2D blocks variable
erosion<<<blocks, D*D>>>(dev_pixels, img_width, img_height);
           ^       ^
           |       |
           |       1D array of threads
           2D array of blocks

Since your threadblock is a 1D array of threads, threadIdx.y is always zero (for every thread in every block). Therefore ty is always zero, and this if-test always fails:

if (tx>=R && tx<BLOCK_W-R && ty>=R && ty < BLOCK_H-R)

since ty(==0) is never greater than or equal to R(==1)

You can launch a 2D array of threads in each block by defining an appropriate dim3 quantity:

dim3 threads(D,D);

and passing that in your kernel config:

erosion<<<blocks, threads>>>(dev_pixels, img_width, img_height);

I can't say whether this is sensible for the rest of your code, but with that modification I can say that the interior (body) of your if-statement in question will be reached.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257