-1

I am writing this CUDA code to convert RGB image to Greyscale using CUDA. I am currently learning CUDA and OpenCV so most of the thing is written by taking help from other codes especially "Intro to Parallel Program" Psets.

The output I am getting is a plain grey image. How can I find the problem in this code?

#include <iostream>
#include <cuda.h>
#include <cuda_runtime.h>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>

using namespace cv;
using namespace std;

__global__ void rgba_to_greyscale(const uchar4* const rgbaImage,
    unsigned char* greyImage,
    int numRows, int numCols)
{
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;        

    if (col >= numCols || row >= numRows) {
        return;
}

    int offset = row * numCols + col;

    uchar4 rgba_pixel = rgbaImage[offset];
    float greyness = .299f * rgba_pixel.x + .587f * rgba_pixel.y +
        .114f * rgba_pixel.z;
    greyImage[offset] = static_cast<unsigned char>(greyness);
}

int main()
{
    Mat imageRGBA;
    Mat imageGrey;
    uchar4        *h_rgbaImage;
    uchar4 *d_rgbaImage = NULL;
    unsigned char *h_greyImage;
    unsigned char *d_greyImage = NULL;
    ///////////////////////////////////
    Mat image;
    image = cv::imread("IMG.jpg");
    if (image.empty()) {
        cerr << "Couldn't open file: " << endl;
        exit(1);
    }

    ///////////////////////////////////
    int numRows = image.rows;
    int numCols = image.cols;
    ///////////////////////////////////////
    cvtColor(image, imageRGBA, COLOR_BGR2RGBA);    

    //Allocate Memory for output
    imageGrey.create(image.rows, image.cols, CV_8UC1);
    h_rgbaImage = (uchar4 *)imageRGBA.data;
    h_greyImage = (unsigned char *)imageGrey.data;

    const size_t numPixels = numRows * numCols;

    //Allocate memory on the device for both input and output

    cudaMalloc((void**)d_rgbaImage, sizeof(uchar4) * numPixels);
    cudaMalloc((void**)d_greyImage, sizeof(unsigned char) * numPixels);
    cudaMemset((void *)d_greyImage, 0, numPixels * sizeof(unsigned char));
    //Copy input array to the GPU

    cudaMemcpy(d_rgbaImage, h_rgbaImage, sizeof(uchar4)*numPixels,         
    cudaMemcpyHostToDevice);

    //Calling the Kernel - 

    const dim3 blockSize(32, 16, 1);
    const dim3 gridSize(1 + (numCols / blockSize.x), 1 + (numRows /     
     blockSize.y), 1);

    rgba_to_greyscale <<<gridSize, blockSize >>> (d_rgbaImage, d_greyImage, 
    numRows, numCols);

    //Copy Output array to Host

    cudaMemcpy(h_greyImage, d_greyImage, sizeof(unsigned char) * numPixels,     
    cudaMemcpyDeviceToHost);

    //Check Output
    Mat output;
    output = Mat(numRows, numCols, CV_8UC1, (void*)h_greyImage);
    imwrite("result.jpg", output);  
}
halfer
  • 19,824
  • 17
  • 99
  • 186
Sherry
  • 59
  • 5
  • Maybe (just maybe) you need a ``__syncthreads()` after your check if `col` (or `row` respectively) is bigger than the maximum allowed value. – Thomas Lang Dec 10 '18 at 05:29
  • @ThomasLang tried that. Still the same output. – Sherry Dec 10 '18 at 05:35
  • Are you using the correct file format? For grayscale images one normally uses `pgm` files, where you just write one value (the grayscale value). In JPG however, you still have to write such RGBa quadruplets. – Thomas Lang Dec 10 '18 at 05:52
  • Please add [`CUDA error checking`](https://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api/14038590#14038590) in the code. – sgarizvi Dec 10 '18 at 06:17
  • @ThomasLang: Apart from having no effect whatsoever on the data consistency of this kernel, the change suggested in your first comment would introduce a potential deadlock in the code – talonmies Dec 10 '18 at 06:17
  • @talonmies Actually, under OpenCL one needs a sync(0) after every if-condition such that the diverging control flows unify again. Otherwise the behaviour is unpredictable. In OpenCL, this won't actually block anything, not sure what the CUDA equivalent is. – Thomas Lang Dec 10 '18 at 06:20
  • The CUDA behaviour is a potential deadlock. Every (nominated in some implementations) thread or a thread from every warp must decrement the counter associated with the barrier before threads at the barrier can proceed. If that conditions isn't fulfilled, you have deadlock – talonmies Dec 10 '18 at 06:25
  • @talonmies Interesting, didn't know that. Thanks. – Thomas Lang Dec 10 '18 at 06:31

1 Answers1

2

The device memory allocation calls in your code are invalid.

cudaMalloc((void**)d_rgbaImage, sizeof(uchar4) * numPixels);
cudaMalloc((void**)d_greyImage, sizeof(unsigned char) * numPixels);

Practically, the above calls do nothing. Please correct the calls as follows so that the pointer is actually modified.

cudaMalloc((void**)&d_rgbaImage, sizeof(uchar4) * numPixels);
                   ^
cudaMalloc((void**)&d_greyImage, sizeof(unsigned char) * numPixels);
                   ^

Also, please make sure you check for CUDA errors in your code, so that issues like these can be tracked easily.

sgarizvi
  • 16,623
  • 9
  • 64
  • 98