CUDA not executing code after nested loop in kernel function

Question

I am trying to apply Gaussian Filter on image using CUDA.

int main() {

    double GKernel[5][5];
    FilterCreation(GKernel);
    double * kernel = new double[25];
    int i,j,k = 0;
    for( int i = 0; i < 5; i++){
     for( int j = 0; j < 5; j++){
       kernel[k++] = GKernel[i][j];
     }
    }

    double * deviceKernel;

    cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
    cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);

    Info dat = readBMP("konik.bmp");
    unsigned char * devPtr;
    unsigned char * devPtrFilter;
    size_t pitch;

    unsigned char * test= new unsigned char  [dat.size - dat.offset ];

    cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char  ));
    cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char  ));

    cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) *  dat.size , cudaMemcpyHostToDevice );

        greyScale<<<dat.height,dat.width>>>(devPtr,dat.height,dat.width);
        CreateGaussFilter<<<dat.height,dat.width>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);

    cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char )  *  dat.size ,cudaMemcpyDeviceToHost );
    cudaDeviceSynchronize ();

    ofstream fout;
    fout.open("output.bmp", ios::binary | ios::out);
    fout.write( reinterpret_cast<char *>(dat.info), dat.offset);

    fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
    fout.close();
    return 0;
}

First i call greyScale kernel that works as it should.

__global__ void greyScale( unsigned char * src , int rows, int cols){

        int i = blockDim.x * blockIdx.x * threadIdx.x;
        int j = blockDim.y * blockIdx.y * threadIdx.y;

        if( i >= rows || j >= cols ) {
                return;
        }
            unsigned char r = src[3 * (i * cols + j)];
            unsigned char g = src[3 * (i * cols + j) + 1];
            unsigned char b = src[3 * (i * cols + j) + 2];

            unsigned char linearIntensity = (unsigned char)(0.2126f * r + 0.7512f * g + 0);

            src[3 * (i * cols + j)] = linearIntensity;
            src[3 * (i * cols + j) + 1] = linearIntensity;
            src[3 * (i * cols + j) + 2] = linearIntensity;

}

But what bothers me is my CreateGaussFilter kernel function:

__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
    int verticleImageBound=(kernalHeight-1)/2;
    int horizontalImageBound=(kernalWidth-1)/2;

    int row = ( blockDim.x * blockIdx.x * threadIdx.x ) + verticleImageBound;
    int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound;

    if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
            float  value=0;
            for(int kRow=0;kRow<kernalHeight;kRow++){
                  for(int kCol=0;kCol<kernalWidth;kCol++){
                  float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
                  value+=pixel;
             }
            }

            printf("value = %i\n",value);

            dst[3 * ( row * cols + col )] = round(value);
            dst[3 * ( row * cols + col ) + 1] = round(value);
            dst[3 * ( row * cols + col ) + 2] = round(value);

}

it actually outputs black image ( this works when im running it on CPU without CUDA ), after debugging with printf it seems like the code wont execute after for loop, im not sure why, the boundaries works in non CUDA version and block size are same for greyScale kernel that works. Why isnt code executed after for loop? Is error occuring somewhere and shutting down the thread? Im quite confused.

Code for loading images:

struct Info{
    int width;
    int height;
    int offset;
    unsigned char * info;
    unsigned char * data;

    int size;
};

Info readBMP(char* filename)
{
    int i;
    std::ifstream is(filename, std::ifstream::binary);
    is.seekg(0, is.end);
    i = is.tellg();
    is.seekg(0);
    unsigned char *info = new unsigned char[i];
    is.read((char *)info,i);

    int width = *(int*)&info[18];
    int height = *(int*)&info[22];
    int offset = *(int*)&info[10];

    unsigned char a[offset];
    unsigned char *b = new unsigned char[i - offset];
    std::copy(info,
              info + offset,
              a);

    std::copy(info + offset,
              info + i,
              b + 0);

    Info dat;
    dat.width = width;
    dat.height = height;
    dat.offset = offset;
    dat.size = i;
    dat.info = new unsigned char[offset - 1];
    dat.data = new unsigned char[i - offset + 1];

    for( int j = 0; j < offset ; j++ ){
        dat.info[j] = a[j];
    }

    for( int j = 0; j < i - offset; j++ ){
        dat.data[j] = b[j];
    }
    return dat;

}

Thanks for help!

Any time you are having trouble with a CUDA code, I recommend [proper CUDA error checking](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) and also run your code with `cuda-memcheck`. — Robert Crovella, Dec 01 '19 at 14:50
im directly compiling .cu file if i understood it correctly thats not runtime api.. or? — Darlyn, Dec 01 '19 at 14:51
What is the size of the bitmap? I would guess the kernel never even runs — talonmies, Dec 01 '19 at 14:52
@talonmies kernel runs, if i put printF for indexes of threads it outputs it ...size is 511*766*3 ( for pixels of img ) — Darlyn, Dec 01 '19 at 14:53
So what do you want from us? You haven't provided an [MCVE], and have no error description beyond a black image. — talonmies, Dec 01 '19 at 15:26

Robert Crovella · Accepted Answer · 2019-12-09T03:30:10.910

I doubt that either of your kernels are working correctly. You have at least 2 issues:

This is not how to create a proper index:
```
blockDim.x * blockIdx.x * threadIdx.x
```
a proper index looks like this:
```
blockDim.x * blockIdx.x + threadIdx.x
```
this error is evident for both .x and .y indices, in both kernels.
You are confused about CUDA kernel launch syntax, e.g. here: CreateGaussFilter<<<dat.height,dat.width>>>. The first <<<...>>> argument is the number of blocks per grid. The second is the number of threads per block. If you pass scalar quantities for both of these arguments (which you do) you will get a 1D grid of 1D threadblocks. 1D here means that in-kernel, your .y index values will always be zero, so this statement: int col = ( blockDim.y * blockIdx.y * threadIdx.y ) + horizontalImageBound; will result in every thread in your kernel having a col value of horizontalImageBound
in your in-kernel printf statement, %i is not the correct format parameter for a float quantity.

You should study any CUDA 2D kernel code for proper usage. Changes are needed in both your host code as well as your kernel code.

A few other notes.

It's nice if you don't strip off the headers your code uses. Some people trying to help you will want to run your code. Make it easy for them (if you want help from them). Just my suggestion, as is this entire post of mine.
You are expected to provide a complete example. See item 1 here. For example, your posted code does not indicate a definition of FilterCreation anywhere. And I don't have your konik.bmp, so either indicate how I can get it, or even better, write your code that you post here in such a way that it does not depend on an external file. For example create a dummy image in code, and skip the file load process.
This doesn't have to be that hard. Take what you've posted and create a new project with just that code. Does it compile? If not, keep adding to your posting until it compiles. Then does your posted code reproduce the issue? If not, keep adjusting till it does. In other words, put yourself in the place of those trying to help you. Again, just suggestions.

What follows is a code I attempted to build around what you have shown, while avoiding the issues I mentioned above. I make no claim that it produces the correct output, but should give you an idea how to fix some of the mistakes indicated above.

#include <iostream>
#include <fstream>
struct Info{
    int width;
    int height;
    int offset;
    unsigned char * info;
    unsigned char * data;

    int size;
};

Info readBMP(char* filename)
{
    int i;
    std::ifstream is(filename, std::ifstream::binary);
    is.seekg(0, is.end);
    i = is.tellg();
    is.seekg(0);
    unsigned char *info = new unsigned char[i];
    is.read((char *)info,i);

    int width = *(int*)&info[18];
    int height = *(int*)&info[22];
    int offset = *(int*)&info[10];

    unsigned char a[offset];
    unsigned char *b = new unsigned char[i - offset];
    std::copy(info,
              info + offset,
              a);

    std::copy(info + offset,
              info + i,
              b + 0);

    Info dat;
    dat.width = width;
    dat.height = height;
    dat.offset = offset;
    dat.size = i;
    dat.info = new unsigned char[offset - 1];
    dat.data = new unsigned char[i - offset + 1];

    for( int j = 0; j < offset ; j++ ){
        dat.info[j] = a[j];
    }

    for( int j = 0; j < i - offset; j++ ){
        dat.data[j] = b[j];
    }
    return dat;

}

__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
    int verticleImageBound=(kernalHeight-1)/2;
    int horizontalImageBound=(kernalWidth-1)/2;

    int row = ( blockDim.x * blockIdx.x + threadIdx.x ) + verticleImageBound;
    int col = ( blockDim.y * blockIdx.y + threadIdx.y ) + horizontalImageBound;

    if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
            float  value=0;
            for(int kRow=0;kRow<kernalHeight;kRow++){
                  for(int kCol=0;kCol<kernalWidth;kCol++){
                  float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
                  value+=pixel;
             }
            }

            printf("value = %f\n",round(value));

            dst[3 * ( row * cols + col )] = round(value);
            dst[3 * ( row * cols + col ) + 1] = round(value);
            dst[3 * ( row * cols + col ) + 2] = round(value);

}

__global__ void greyScale( unsigned char * src , int rows, int cols){

        int i = blockDim.x * blockIdx.x + threadIdx.x;
        int j = blockDim.y * blockIdx.y + threadIdx.y;

        if( i >= rows || j >= cols ) {
                return;
        }
            unsigned char r = src[3 * (i * cols + j)];
            unsigned char g = src[3 * (i * cols + j) + 1];
            unsigned char b = src[3 * (i * cols + j) + 2];

            unsigned char linearIntensity = (unsigned char)(0.2126f * r + 0.7512f * g + 0);

            src[3 * (i * cols + j)] = linearIntensity;
            src[3 * (i * cols + j) + 1] = linearIntensity;
            src[3 * (i * cols + j) + 2] = linearIntensity;

}

int main() {

    double GKernel[5][5] = {0.1};
    //FilterCreation(GKernel);
    double * kernel = new double[25];
    int i,j,k = 0;
    for( int i = 0; i < 5; i++){
     for( int j = 0; j < 5; j++){
       kernel[k++] = GKernel[i][j];
     }
    }

    double * deviceKernel;

    cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
    cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);

    Info dat; // = readBMP("konik.bmp");
    dat.width = 766;
    dat.height = 511;
    dat.size = dat.width*dat.height*3;
    dat.offset = 0;
    dat.data = new unsigned char[dat.size];
    unsigned char * devPtr;
    unsigned char * devPtrFilter;
    size_t pitch;

    unsigned char * test= new unsigned char  [dat.size - dat.offset ];

    cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char  ));
    cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char  ));

    cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) *  dat.size , cudaMemcpyHostToDevice );

    dim3 block(32,32);
    dim3 grid((dat.height+31)/32, (dat.width+31)/32);

        greyScale<<<grid,block>>>(devPtr,dat.height,dat.width);
        CreateGaussFilter<<<grid,block>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);

    cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char )  *  dat.size ,cudaMemcpyDeviceToHost );
    cudaDeviceSynchronize ();
#if 0
    std::ofstream fout;
    fout.open("output.bmp", std::ios::binary | std::ios::out);
    fout.write( reinterpret_cast<char *>(dat.info), dat.offset);

    fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
    fout.close();
#endif
    return 0;
}

There were a few other issues with your code.

your RGB to grayscale is using 0 instead of b.
your gaussian kernel does not write to all output points, so we will fill the output with 0 first.
I've provided my own 5x5 gaussian kernel coefficients.

With those additional changes:

$ cat t8.cu
#include <iostream>
#include <fstream>
#include <stdio.h>

struct Info{
    int width;
    int height;
    int offset;
    unsigned char * info;
    unsigned char * data;

    int size;
};

Info readBMP(const char* filename)
{
    int i;
    std::ifstream is(filename, std::ifstream::binary);
    is.seekg(0, is.end);
    i = is.tellg();
    is.seekg(0);
    unsigned char *info = new unsigned char[i];
    is.read((char *)info,i);

    int width = *(int*)&info[18];
    int height = *(int*)&info[22];
    int offset = *(int*)&info[10];

    unsigned char a[offset];
    unsigned char *b = new unsigned char[i - offset];
    std::copy(info,
              info + offset,
              a);

    std::copy(info + offset,
              info + i,
              b + 0);

    Info dat;
    dat.width = width;
    dat.height = height;
    dat.offset = offset;
    dat.size = i;
    dat.info = new unsigned char[offset - 1];
    dat.data = new unsigned char[i - offset + 1];

    for( int j = 0; j < offset ; j++ ){
        dat.info[j] = a[j];
    }

    for( int j = 0; j < i - offset; j++ ){
        dat.data[j] = b[j];
    }
    return dat;

}

__global__ void CreateGaussFilter(unsigned char * src, unsigned char * dst, int kernalHeight, int kernalWidth, double *kernalArray, int rows, int cols){
    int verticleImageBound=(kernalHeight-1)/2;
    int horizontalImageBound=(kernalWidth-1)/2;

    int row = ( blockDim.x * blockIdx.x + threadIdx.x ) + verticleImageBound;
    int col = ( blockDim.y * blockIdx.y + threadIdx.y ) + horizontalImageBound;

    if ( row >= rows - verticleImageBound || col >= cols - horizontalImageBound ) return;
            float  value=0;
            for(int kRow=0;kRow<kernalHeight;kRow++){
                  for(int kCol=0;kCol<kernalWidth;kCol++){
                  float pixel = src[ 3 * ((kRow+row-verticleImageBound ) * cols + (kCol+col-horizontalImageBound)) ] * kernalArray[kRow + kCol * kernalWidth];
                  value+=pixel;
             }
            }

//            printf("value = %f\n",round(value));

            dst[3 * ( row * cols + col )] = round(value);
            dst[3 * ( row * cols + col ) + 1] = round(value);
            dst[3 * ( row * cols + col ) + 2] = round(value);

}

__global__ void greyScale( unsigned char * src , int rows, int cols){

        int i = blockDim.x * blockIdx.x + threadIdx.x;
        int j = blockDim.y * blockIdx.y + threadIdx.y;

        if( i >= rows || j >= cols ) {
                return;
        }
            unsigned char r = src[3 * (i * cols + j)];
            unsigned char g = src[3 * (i * cols + j) + 1];
            unsigned char b = src[3 * (i * cols + j) + 2];

            unsigned char linearIntensity = (unsigned char)(0.21f * r + 0.72f * g + 0.07 * b);

            src[3 * (i * cols + j)] = linearIntensity;
            src[3 * (i * cols + j) + 1] = linearIntensity;
            src[3 * (i * cols + j) + 2] = linearIntensity;

}
int main() {

    double GKernel[5][5] = {{1,4,7,4,1},{4,16,26,16,4},{7,26,41,26,7},{4,16,26,16,4},{1,4,7,4,1}};
    //FilterCreation(GKernel);
    double * kernel = new double[25];
    int k = 0;
    for( int i = 0; i < 5; i++){
     for( int j = 0; j < 5; j++){
       kernel[k++] = GKernel[i][j]/273;
     }
    }

    double * deviceKernel;

    cudaMalloc((void **)&deviceKernel, 25 * sizeof(double));
    cudaMemcpy(deviceKernel, kernel, 25 * sizeof(double), cudaMemcpyHostToDevice);

    Info dat = readBMP("input.bmp");
#if 0
    dat.width = 766;
    dat.height = 511;
    dat.size = dat.width*dat.height*3;
    dat.offset = 0;
    dat.data = new unsigned char[dat.size];
    for (int i = 0; i<dat.size; i++) dat.data[i] = (i%dat.width)%255;
#endif
    unsigned char * devPtr;
    unsigned char * devPtrFilter;

    unsigned char * test= new unsigned char  [dat.size - dat.offset ];

    cudaMalloc (( void **)& devPtr , dat.size * sizeof ( unsigned char  ));
    cudaMalloc (( void **)& devPtrFilter , dat.size * sizeof ( unsigned char  ));
    cudaMemset(devPtrFilter, 0, dat.size);
    cudaMemcpy ( devPtr , dat.data , sizeof ( unsigned char ) *  dat.size , cudaMemcpyHostToDevice );

    dim3 block(32,32);
    dim3 grid((dat.height+31)/32, (dat.width+31)/32);

        greyScale<<<grid,block>>>(devPtr,dat.height,dat.width);
        CreateGaussFilter<<<grid,block>>>(devPtr,devPtrFilter,5,5,deviceKernel,dat.height,dat.width);

    cudaMemcpy ( test, devPtrFilter , sizeof ( unsigned char )  *  (dat.size - dat.offset) ,cudaMemcpyDeviceToHost );
    cudaDeviceSynchronize ();
    std::ofstream fout;
    fout.open("output.bmp", std::ios::binary | std::ios::out);
    fout.write( reinterpret_cast<char *>(dat.info), dat.offset);

    fout.write( reinterpret_cast<char *>(test), dat.size - dat.offset );
    fout.close();
    return 0;
}
$ nvcc -o t8 t8.cu
$ cuda-memcheck ./t8

and starting with the BarbaraBlocks1.bmp file that is available at /usr/local/cuda/samples/3_Imaging/dct8x8/data on a standard, current CUDA linux install, which looks like this:

It produces output like this:

CUDA not executing code after nested loop in kernel function

1 Answers1