RGB to greyscale conversion using CUDA

Question

So I am trying to write a program that turns RGB images to greyscale. I got the idea from the Udacity problem set. The problem is that when I write out the kernel in the Udacity web environment, it says my code works, however, when I try to do it locally on my computer, I get no errors, but my image instead of coming out greyscale, comes out completely grey. It looks like one grey box the dimensions of the image I loaded. Can you help me find the error in my code, I've compared it with the Udacity version and I can't seem to find it.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <string>
#include <cuda.h>
#include <stdio.h>
#include <opencv\cv.h>
#include <opencv\highgui.h>
#include <iostream>



#define CUDA_ERROR_CHECK

#define CudaSafeCall( err ) __cudaSafeCall( err, __FILE__, __LINE__ )
#define CudaCheckError()    __cudaCheckError( __FILE__, __LINE__ )

inline void __cudaSafeCall(cudaError err, const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    if (cudaSuccess != err)
    {
        fprintf(stderr, "cudaSafeCall() failed at %s:%i : %s\n",
            file, line, cudaGetErrorString(err));
        exit(-1);
    }
#endif

    return;
}

inline void __cudaCheckError(const char *file, const int line)
{
#ifdef CUDA_ERROR_CHECK
    cudaError err = cudaGetLastError();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "cudaCheckError() failed at %s:%i : %s\n",
            file, line, cudaGetErrorString(err));
        exit(-1);
    }


    err = cudaDeviceSynchronize();
    if (cudaSuccess != err)
    {
        fprintf(stderr, "cudaCheckError() with sync failed at %s:%i : %s\n",
            file, line, cudaGetErrorString(err));
        exit(-1);
    }
#endif

    return;
}

__global__ void rgb_2_grey(uchar* const greyImage, const uchar4* const rgbImage, int rows, int columns)
{
    int rgb_x = blockIdx.x * blockDim.x + threadIdx.x; //x coordinate of pixel
    int rgb_y = blockIdx.y * blockDim.y + threadIdx.y; //y coordinate of pixel

    if ((rgb_x >= columns) && (rgb_y >= rows)) {
        return;
    }

    int rgb_ab = rgb_y*columns + rgb_x; //absolute pixel position
    uchar4 rgb_Img = rgbImage[rgb_ab];
    greyImage[rgb_ab] = uchar((float(rgb_Img.x))*0.299f + (float(rgb_Img.y))*0.587f + (float(rgb_Img.z))*0.114f);
}
using namespace cv;
using namespace std;

void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage);
void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols);
void Save_Img();

Mat img_RGB;
Mat img_Grey;
uchar4 *d_rgbImg;
uchar *d_greyImg; 
int main()
{
        uchar4* h_rgbImg;
        //uchar4* d_rgbImge=0;
        uchar* h_greyImg;
        //uchar* d_greyImge=0;

        Proc_Img(&h_rgbImg, &h_greyImg, &d_rgbImg, &d_greyImg);
        RGB_2_Greyscale(d_greyImg, d_rgbImg, img_RGB.rows, img_RGB.cols);
        Save_Img();





    return 0;
}
void Proc_Img(uchar4** h_RGBImage, uchar** h_greyImage, uchar4 **d_RGBImage, uchar** d_greyImage){
    cudaFree(0);
    CudaCheckError();

    //loads image into a matrix object along with the colors in BGR format (must convert to rgb).
    Mat img = imread("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG", CV_LOAD_IMAGE_COLOR);
    if (img.empty()){
        cerr << "couldnt open file dumbas..." << "C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581.JPG" << endl;
        exit(1);
    }

    //converts color type from BGR to RGB
    cvtColor(img, img_RGB, CV_BGR2RGBA);

    //allocate memory for new greyscale image. 
    //img.rows returns the range of pixels in y, img.cols returns range of pixels in x
    //CV_8UC1 means 8 bit unsigned(non-negative) single channel of color, aka greyscale.
    //all three of the parameters allow the create function in the Mat class to determine how much memory to allocate
    img_Grey.create(img.rows, img.cols, CV_8UC1);

    //creates rgb and greyscale image arrays
    *h_RGBImage = (uchar4*)img_RGB.ptr<uchar>(0); //.ptr is a method in the mat class that returns a pointer to the first element of the matrix.
    *h_greyImage = (uchar*)img_Grey.ptr<uchar>(0);        //this is just like a regular array/pointer mem address to first element of the array. This is templated
                                                          //in this case the compiler runs the function for returning pointer of type unsigned char. for rgb image it is
                                                          //cast to uchar4 struct to hold r,g, and b values.

    const size_t num_pix = (img_RGB.rows) * (img_RGB.cols); //amount of pixels 

    //allocate memory on gpu
    cudaMalloc(d_RGBImage, sizeof(uchar4) * num_pix); //bites of 1 uchar4 times # of pixels gives number of bites necessary for array
    CudaCheckError();
    cudaMalloc(d_greyImage, sizeof(uchar) * num_pix);//bites of uchar times # pixels gives number of bites necessary for array
    CudaCheckError();
    cudaMemset(*d_greyImage, 0, sizeof(uchar) * num_pix);
    CudaCheckError();


    //copy array into allocated space
    cudaMemcpy(*d_RGBImage, *h_RGBImage, sizeof(uchar4)*num_pix, cudaMemcpyHostToDevice);
    CudaCheckError();


    d_rgbImg = *d_RGBImage;
    d_greyImg = *d_greyImage; 
}


void RGB_2_Greyscale(uchar* const d_greyImage, uchar4* const d_RGBImage, size_t num_Rows, size_t num_Cols){

    const int BS = 16;
    const dim3 blockSize(BS, BS);
    const dim3 gridSize((num_Cols / BS) + 1, (num_Rows / BS) + 1); 

    rgb_2_grey <<<gridSize, blockSize>>>(d_greyImage, d_RGBImage, num_Rows, num_Cols);

    cudaDeviceSynchronize(); CudaCheckError();


}



void Save_Img(){

    const size_t num_pix = (img_RGB.rows) * (img_RGB.cols);
    cudaMemcpy(img_Grey.ptr<uchar>(0), d_greyImg, sizeof(uchar)*num_pix, cudaMemcpyDeviceToHost);
    CudaCheckError();


    imwrite("C:\\Users\\Austin\\Pictures\\wallpapers\\IMG_3581GR.JPG", img_Grey);

    cudaFree(d_rgbImg);
    cudaFree(d_greyImg);

}

EDIT: I realized that the local var in my main is the same name as the global var, I have edited the code here, now I get the error from visual studio that the

variable d_rgbIme is being used without being initialized

when I have already initialized it above. If I set them equal to zero I get a CUDA error saying

an illegal memory access was encountered

I tried running cuda-memcheck, but then I get the error that i could not run the file...

You're launching a grid of threads that is larger than your image. But you have no thread-check in your kernel. This means that some threads in your kernel will be accessing your image out-of-bounds. Try adding a thread-check in your kernel like `if ((rgb_x < columns) && (rgb_y < rows)) {` before doing any reading or writing in your kernel. Also try running your code with `cuda-memcheck`, and add [proper cuda error checking](http://stackoverflow.com/questions/14038589/what-is-the-canonical-way-to-check-for-errors-using-the-cuda-runtime-api) to your code. — Robert Crovella, Aug 14 '14 at 05:18
You need to cast int to float before multiplication in (rgb_Img.x)*0.299f. Change it to (float(rgb_Img.x))*0.299f. — Andrey Smorodov, Aug 14 '14 at 08:01
@AndreySmorodov that will happen [automatically](http://stackoverflow.com/questions/5563000/implicit-type-conversion-rules-in-c-operators). It is not necessary to explicitly cast to `float`, when one of the multiplication arguments is already a `float` (0.299f is a `float` constant). — Robert Crovella, Aug 14 '14 at 13:12
Hi guys thanks for the comments. I have taken all you advice and implemented it into my code, unfortunately, it still doesn't run properly on my computer, only on Udacity's online enviroment. — tinman248, Aug 15 '14 at 01:21
Why don't you update the code posted in the question, to demonstrate the error checking you have added. i.e. show the code you are currently using/trying. — Robert Crovella, Aug 15 '14 at 14:29
Did you run your code with `cuda-memcheck` ? The one remaining error I can find in your code is that this kernel line: `if ((rgb_x >= columns) && (rgb_y >= rows)) {` should be this: `if ((rgb_x >= columns) || (rgb_y >= rows)) {` You want the thread to exit if either `x` *or* `y` is out of range. && is boolean AND. || is boolean OR. — Robert Crovella, Aug 15 '14 at 20:54
HA!!! the `if ((rgb_x >= columns) || (rgb_y >= rows)) {` change did it!! — tinman248, Aug 15 '14 at 20:59
You should probably learn how to use `cuda-memcheck`, it's pretty simple. This statement doesn't tell me much: "I tried running cuda-memcheck, but then I get the error that i could not run the file... " What command line did you use to run cuda-memcheck, exactly? What was the exact output? If you answer those questions, you can probably learn how to use it and it will be useful for debugging CUDA programs. — Robert Crovella, Aug 15 '14 at 22:56
I guess is not an executable. When I ask for the exact command line, and you say "cuda-memcheck ", I guess that is not the exact command line. I guess you also haven't indicated the exact output. It's difficult to help you that way. — Robert Crovella, Aug 18 '14 at 20:49

score 5 · Accepted Answer · answered Aug 15 '14 at 21:00

5

I have found the error thanks to one of the comments by Robert Crovella, he has been very helpful with this! it is in my kernel the if statement should read if ((rgb_x >= columns) || (rgb_y >= rows)) {

answered Aug 15 '14 at 21:00

tinman248

345
1
3
10

score 0 · Answer 2 · answered Aug 28 '14 at 11:35

I was working on the same problem in JCUDA. See if you can use any part of this solution:

//Read Height and Width of image in Height & Width variables
int Width = image.getWidth();
int Height = image.getHeight();

int N = Height * Width;
int[] grayScale = new int[N];

//Allocate separate arrays to store Alpha, Red, Green and 
//Blue values for every pixel 
int[] redHost = new int[N];
int[] greenHost = new int[N];
int[] blueHost = new int[N];
int[] alphaHost = new int[N];

for(int i=0; i<Height; i++)
{
    for(int j=0; j<Width; j++)
    {
        int pixel = image.getRGB(j, i);
        //Read the ARGB data
        alphaHost[i*Width+j] = (pixel >> 24) & 0xff;
        redHost[i*Width+j] = (pixel >> 16) & 0xff;
        greenHost[i*Width+j] = (pixel >> 8) & 0xff;
        blueHost[i*Width+j] = (pixel) & 0xff;
    }
}

/* Following are the CUDA Kernel parameters*/

Pointer kernelParameters = Pointer.to(
                                Pointer.to(new int[]{N}), //Total size of each array W * H 
                                Pointer.to(redDev),       // Pointer to redArray on device
                                Pointer.to(greenDev),     // Pointer to greenArray on device
                                Pointer.to(blueDev),      // Pointer to blueArray on device
                                Pointer.to(Output));      //Pointer to output array

/*Following is my RGBToGrayScale.cu..i.e. CUDA Kernel */

__global__ void RGBtoGrayScale(int N, int *red, int *green, int *blue, int *Output)
{
        int id = blockIdx.x * blockDim.x + threadIdx.x;

    if(id<N)
    {
        Output[id] = (red[id]*0.2989) + (green[id]*0.587) + (blue[id]*0.114);
    }

}

/* Get the output data back to Host memory */

cuMemcpyDtoH(Pointer.to(grayScale), Output, N * Sizeof.INT);

/* Write the image with the new RBG values*/

BufferedImage im = new BufferedImage(Width,Height,BufferedImage.TYPE_BYTE_GRAY);
        WritableRaster raster = im.getRaster();
        for(int i=0;i<Height;i++)
        {
            for(int j=0;j<Width;j++)
            {
                raster.setSample(j, i, 0, grayScale[i*Width+j]);
            }
        }
        try 
        {
            ImageIO.write(im,"JPEG",new File("glpattern.jpeg"));
        } catch (IOException e) 
        {
            e.printStackTrace();
        }

RGB to greyscale conversion using CUDA

2 Answers2