0

I'm trying to convert a 3x3 rgb "image" to grayscale. The image is just a 3d array. You can think of it being a 2d image, with each pixel having a 1x3 array of rgb values.

To do this, I've basically converted the 3d array into a 1D array. There is no issue with the conversion. I basically send this vector array to the cuda device and it returns me a 1x9 (one grayscale value for each pixel) processed vector array. I've tried to do this by first creating 3 blocks, each for RGB. Each block has 3x3 threads where each thread targets either the red, blue, or green value (determined by the block index) of a pixel. I then multiply each red, blue, or green value by a corresponding factor and add that value to the corresponding pixel of the output array.

However, what I've been getting is an array that is comprised of all zeroes.

#include <stdio.h>

//
__global__ void RGBToGrayScale(float *d_out, float *d_in, int rowCount, int colCount) {
    float grayScaleAddition;
    int temp = d_in[blockIdx.x * rowCount * colCount + threadIdx.x * colCount + threadIdx.y];
    switch(blockIdx.x) {
        case 0:
            grayScaleAddition = 0.299 * temp;
            break;
        case 1:
            grayScaleAddition = 0.587 * temp;
            break;
        case 2:
            grayScaleAddition = 0.114 * temp;
            break;
    }
    d_out[threadIdx.x * colCount + threadIdx.y] += grayScale;
}

int main() {
    int image[3][3][3] = { //3 rows, columns, and 3 rgb values for each pixel
        {{1, 3, 2}, {4, 5, 6}, {7, 8, 9}},
        {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
        {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}
    };

    const int IMAGE_ROW_COUNT = 3;
    const int IMAGE_COLUMN_COUNT = 3;
    const int ARRAY_BYTES = IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT * 3 * sizeof(float);

    //converting image to a 1D array
    float* h_in = (float*)malloc(ARRAY_BYTES);
    float* h_out = (float*)malloc(ARRAY_BYTES / 3);

    for (int i = 0; i < IMAGE_ROW_COUNT; ++i) {//no issue with conversion, checked manually
        for (int j = 0; j < IMAGE_COLUMN_COUNT; ++j) {
            for (int k = 0; k < 3; ++k) {
                h_in[k * IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT + i * IMAGE_COLUMN_COUNT + j] = float(image[i][j][k]);
            }
        }
    }

    //declare GPU memory pointers
    float* d_in;
    float* d_out;

    cudaMalloc((void **) &d_in, ARRAY_BYTES);
    cudaMalloc((void **) &d_out, ARRAY_BYTES / 3);

    //transfer array into GPU
    cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice);

    RGBToGrayScale<<<3, dim3(IMAGE_ROW_COUNT, IMAGE_COLUMN_COUNT)>>>(d_out, d_in, IMAGE_ROW_COUNT, IMAGE_COLUMN_COUNT);

    //copy back the result array to CPU
    cudaMemcpy(h_out, d_out, ARRAY_BYTES, cudaMemcpyDeviceToHost);

    for (int i = 0; i < IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT; ++i) {
        printf("%f", h_out[i]);
        printf(((i % 3) != 2) ? "\t" : "\n");
    }

    cudaFree(d_in);
    cudaFree(d_out);
    free(h_in);
    free(h_out);
    return 0;
}
talonmies
  • 70,661
  • 34
  • 192
  • 269

1 Answers1

2

The code does not compile because kernel variable grayScale does not exist. After fixing this and adding proper CUDA error checking, see for example https://stackoverflow.com/a/14038590/5206464 , the program reports an invalid argument for the last cudaMemcpy call.

You attempt to transfer ARRAY_BYTES of data, but both h_out and d_out have size ARRAY_BYTES / 3. Fixing cudaMemcpy leads to non-zero results.

#include <stdio.h>

#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess) 
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}

__global__ void RGBToGrayScale(float *d_out, float *d_in, int rowCount, int colCount) {
    float grayScaleAddition;
    int temp = d_in[blockIdx.x * rowCount * colCount + threadIdx.x * colCount + threadIdx.y];
    switch(blockIdx.x) {
        case 0:
            grayScaleAddition = 0.299 * temp;
            break;
        case 1:
            grayScaleAddition = 0.587 * temp;
            break;
        case 2:
            grayScaleAddition = 0.114 * temp;
            break;
    }
    d_out[threadIdx.x * colCount + threadIdx.y] += grayScaleAddition;
}

int main() {
    int image[3][3][3] = { //3 rows, columns, and 3 rgb values for each pixel
        {{1, 3, 2}, {4, 5, 6}, {7, 8, 9}},
        {{10, 11, 12}, {13, 14, 15}, {16, 17, 18}},
        {{19, 20, 21}, {22, 23, 24}, {25, 26, 27}}
    };

    const int IMAGE_ROW_COUNT = 3;
    const int IMAGE_COLUMN_COUNT = 3;
    const int ARRAY_BYTES = IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT * 3 * sizeof(float);

    //converting image to a 1D array
    float* h_in = (float*)malloc(ARRAY_BYTES);
    float* h_out = (float*)malloc(ARRAY_BYTES / 3);

    for (int i = 0; i < IMAGE_ROW_COUNT; ++i) {//no issue with conversion, checked manually
        for (int j = 0; j < IMAGE_COLUMN_COUNT; ++j) {
            for (int k = 0; k < 3; ++k) {
                h_in[k * IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT + i * IMAGE_COLUMN_COUNT + j] = float(image[i][j][k]);
            }
        }
    }

    //declare GPU memory pointers
    float* d_in;
    float* d_out;

    gpuErrchk(cudaMalloc((void **) &d_in, ARRAY_BYTES));
    gpuErrchk(cudaMalloc((void **) &d_out, ARRAY_BYTES / 3));

    //transfer array into GPU
    gpuErrchk(cudaMemcpy(d_in, h_in, ARRAY_BYTES, cudaMemcpyHostToDevice));

    RGBToGrayScale<<<3, dim3(IMAGE_ROW_COUNT, IMAGE_COLUMN_COUNT)>>>(d_out, d_in, IMAGE_ROW_COUNT, IMAGE_COLUMN_COUNT);
    gpuErrchk(cudaGetLastError());

    //copy back the result array to CPU
    gpuErrchk(cudaMemcpy(h_out, d_out, ARRAY_BYTES / 3, cudaMemcpyDeviceToHost));

    for (int i = 0; i < IMAGE_ROW_COUNT * IMAGE_COLUMN_COUNT; ++i) {
        printf("%f", h_out[i]);
        printf(((i % 3) != 2) ? "\t" : "\n");
    }

    gpuErrchk(cudaFree(d_in));
    gpuErrchk(cudaFree(d_out));
    free(h_in);
    free(h_out);
    return 0;
}
Abator Abetor
  • 2,345
  • 1
  • 10
  • 12