CUDA copy image only copying part of image

Question

Today I was trying to create a program that copied an image using the GPU. I created a simple program that does this. To load the image I am using lodepng. The problem isn't with copying via cudaMemcpy because when I copy the image to GPU and back it stays intact, but when I try to copy it in the kernel, it doesn't. Feel free to ask any questions that you have about my problem.

The code:

#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
#include <Windows.h>
#include <math.h>
#include <LodePNG\lodepng.h>

// Edge length, in threads, of the square thread block used for the kernel
// launch; also the divisor applied to the image width and height to obtain
// the grid dimensions.
const int BLOCK_WIDTH = 32;


using namespace std;

// Copy kernel: every thread copies exactly one byte from `in` to `out`.
// The byte offset is the thread's flattened 2D position in the launch grid,
// where the row length is the total number of threads launched along x
// (gridDim.x * blockDim.x). There is no bounds check, so both buffers must
// hold at least (threads along x) * (threads along y) bytes.
__global__ void expousure(unsigned char *in, unsigned char *out)
{
    const int threadsPerRow = gridDim.x * blockDim.x;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    const int offset = row * threadsPerRow + col;

    out[offset] = in[offset];
}

// Loads the PNG at `filename` through lodepng::decode into `image` and hands
// the decoded dimensions back through `width` and `height` (converted from
// unsigned to int). A decoder error is only printed to stdout; the function
// still returns normally.
void decode(std::vector<unsigned char>& image, const char* filename, int& width, int& height)
{
    unsigned pngWidth, pngHeight;
    const unsigned status = lodepng::decode(image, pngWidth, pngHeight, filename);

    // The dimensions are passed back whether or not decoding succeeded.
    width = static_cast<int>(pngWidth);
    height = static_cast<int>(pngHeight);

    if (status != 0)
    {
        std::cout << "decoder error " << status << ": " << lodepng_error_text(status) << std::endl;
    }

    // The pixels are now in "image", 4 bytes per pixel, ordered RGBARGBA...
}

// Encodes the pixel vector `inPixels` as a PNG of the given dimensions with
// lodepng::encode and writes the result to `filename` with
// lodepng::save_file. On an encoder error the message is printed to stdout
// and nothing is saved.
void encodeAndSave(const std::vector<unsigned char>& inPixels, const char* filename, int width, int height)
{
    std::vector<unsigned char> pngBytes;
    const unsigned status = lodepng::encode(pngBytes, inPixels, static_cast<unsigned>(width), static_cast<unsigned>(height));

    if (status == 0)
    {
        lodepng::save_file(pngBytes, filename);
        return;
    }

    std::cout << "encoder error" << status << ": " << lodepng_error_text(status) << std::endl;
}

// Raw-pointer overload: encodes the buffer at `inPixels` as a PNG of the
// given dimensions with lodepng::encode and writes it to `filename` with
// lodepng::save_file. On an encoder error the message is printed to stdout
// and nothing is saved.
void encodeAndSave(unsigned char* inPixels, const char* filename, int width, int height)
{
    std::vector<unsigned char> pngBytes;
    const unsigned status = lodepng::encode(pngBytes, inPixels, static_cast<unsigned>(width), static_cast<unsigned>(height));

    const bool failed = (status != 0);
    if (failed)
    {
        std::cout << "encoder error" << status << ": " << lodepng_error_text(status) << std::endl;
    }
    else
    {
        lodepng::save_file(pngBytes, filename);
    }
}


// Decodes a PNG, copies it to the device, runs the expousure kernel from
// d_in to d_out, copies the result back and saves it as a new PNG.
int main(int argc, char *argv[])
{
    // Input path: first command-line argument if given, otherwise the default test image.
    const char* filename = (argc > 1) ? argv[1] : "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/test.png";
    const char* outname = "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/testOUT.png";

    int width, height;
    vector<unsigned char> h_image;
    decode(h_image, filename, width, height);

    // Size in bytes of the decoded image at 4 bytes per pixel.
    const size_t imageBytes = sizeof(unsigned char) * width * height * 4;

    unsigned char *d_in;
    unsigned char *d_out;
    cudaMalloc(&d_in, imageBytes);
    cudaMalloc(&d_out, imageBytes);
    cudaMemcpy(d_in, &h_image[0], imageBytes, cudaMemcpyHostToDevice);

    // Grid of (width / BLOCK_WIDTH) x (height / BLOCK_WIDTH) blocks (integer
    // division), each block BLOCK_WIDTH x BLOCK_WIDTH threads.
    const dim3 block(BLOCK_WIDTH, BLOCK_WIDTH, 1);
    const dim3 grid(width / BLOCK_WIDTH, height / BLOCK_WIDTH, 1);
    expousure<<<grid, block>>>(d_in, d_out);

    // Fixed-size host buffer: room for 256 * 256 pixels at 4 bytes each.
    unsigned char h_out[256 * 256 * 4];
    cudaMemcpy(h_out, d_out, imageBytes, cudaMemcpyDeviceToHost);

    // Encode the copied-back pixels and write them to the output file.
    encodeAndSave(h_out, outname, width, height);

}

The input image: https://i.stack.imgur.com/Rx0mF.png

The output image: https://i.stack.imgur.com/HLmPQ.png

score 1 · Accepted Answer · edited May 23 '17 at 12:20

I would point out a few things:

1. The main issue you have is that your thread array is dimensioned to provide 1 thread per pixel, but since each pixel consists of 4 bytes, and your kernel only copies one byte per thread, you are only getting 1/4 of the image copied. The fix for this, in a nutshell, could be launching 4 times as many threads in the x-dimension, to account for 4 bytes per pixel.
2. Any time you are having trouble with a CUDA code, it's a good idea to do proper CUDA error checking, although I don't think it would have turned up anything here. As a quick check, you can also run a CUDA code with cuda-memcheck.
3. It's better if you provide a proper MCVE. Such a complete code does not depend on external things, like lodepng.
4. Your code as written (with or without the fix above) will depend on the image dimensions being evenly divisible by BLOCK_WIDTH. It's a good idea to write code that doesn't have these dependencies, and the modifications involve making sure you are launching enough or more than enough threads in both dimensions at kernel launch time, and then including a "thread check" in your kernel to make sure that only valid threads are doing any work (copying, in this case).

Here's a complete example that doesn't depend on lodepng but demonstrates suitable fixes for items 1, 3, and 4 above.

#include <iostream>
#include <vector>

// Edge length, in threads, of the square thread block used for the kernel
// launch; also the divisor used when computing the grid dimensions.
const int BLOCK_WIDTH = 32;

#define DUMMY_SIZE 256

// Builds a synthetic test image in place of a real PNG decode.
// Appends 4 * DUMMY_SIZE * DUMMY_SIZE bytes to `image`; the values cycle
// 0..7 within each run of DUMMY_SIZE bytes. Sets both `widthU` and `heightU`
// to DUMMY_SIZE and always returns 0. `filename` is not used.
unsigned create_dummy_image(std::vector<unsigned char>& image, unsigned & widthU, unsigned &heightU, const char* filename){

  const int runCount = 4 * DUMMY_SIZE;
  for (int run = 0; run < runCount; ++run) {
    int pos = 0;
    while (pos < DUMMY_SIZE) {
      image.push_back(static_cast<unsigned char>(pos % 8));
      ++pos;
    }
  }
  heightU = DUMMY_SIZE;
  widthU = DUMMY_SIZE;
  return 0;
}

// Stand-in for a PNG encoder: appends the raw pixel bytes to `outEncoded`
// unchanged.
//   outEncoded - destination vector; existing contents are kept and the
//                pixel bytes are appended after them.
//   inPixels   - source buffer of at least 4 * width * height bytes.
//   width, height - image dimensions in pixels (4 bytes per pixel).
// Always returns 0.
unsigned dummy_encode(std::vector<unsigned char> &outEncoded, unsigned char *inPixels, unsigned width, unsigned height){

  // Computed in size_t so a large image cannot wrap a 32-bit index, and so
  // no signed loop counter is compared against an unsigned bound.
  const size_t byteCount = static_cast<size_t>(4) * width * height;

  // One range insert instead of a push_back per byte.
  outEncoded.insert(outEncoded.end(), inPixels, inPixels + byteCount);
  return 0;
}

// Stand-in for saving a file: verifies the data instead of writing it.
// Byte i of `outEncoded` must equal i % 8; on the first byte that does not,
// a message is printed and the process exits with status 1.
// `filename` is not used.
void dummy_save(std::vector<unsigned char> &outEncoded, const char * filename){

  const int total = static_cast<int>(outEncoded.size());
  int pos = 0;
  while (pos < total) {
    const int expected = pos % 8;
    if (outEncoded[pos] != expected) {
      printf("mismatch at %d, was %d, should be %d\n", pos, outEncoded[pos], expected);
      exit(1);
    }
    ++pos;
  }
}

using namespace std;

// Copy kernel: every in-range thread copies one byte from `in` to `out`.
// The thread's x coordinate selects a byte within a row of 4 * width bytes
// and its y coordinate selects the row. Threads whose coordinates fall
// outside 4 * width by height do nothing, so the launch may contain more
// threads than there are bytes.
__global__ void expousure(unsigned char *in, unsigned char *out, const int width, const int height)
{
    const int rowBytes = 4 * width;
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;

    // Thread check: skip threads past the right or bottom edge.
    if (col >= rowBytes || row >= height)
        return;

    const int offset = row * rowBytes + col;
    out[offset] = in[offset];
}


// Fills `image` by calling create_dummy_image and hands the resulting
// dimensions back through `width` and `height` (converted from unsigned to
// int). A non-zero return code is only printed to stdout; the function still
// returns normally.
void decode(std::vector<unsigned char>& image, const char* filename, int& width, int& height)
{
    unsigned imgWidth, imgHeight;
    const unsigned status = create_dummy_image(image, imgWidth, imgHeight, filename);

    // The dimensions are passed back whether or not an error was reported.
    width = static_cast<int>(imgWidth);
    height = static_cast<int>(imgHeight);

    if (status != 0)
    {
        std::cout << "decoder error " << status << ": " << status  << std::endl;
    }

    // The pixels are now in "image", 4 bytes per pixel, ordered RGBARGBA...
}

// Passes the pixel buffer through dummy_encode and then hands the result to
// dummy_save. If dummy_encode reports an error, the code is printed to
// stdout and dummy_save is not called.
void encodeAndSave(unsigned char* inPixels, const char* filename, int width, int height)
{
    std::vector<unsigned char> encoded;
    const unsigned status = dummy_encode(encoded, inPixels, static_cast<unsigned>(width), static_cast<unsigned>(height));

    if (status == 0)
    {
        dummy_save(encoded, filename);
        return;
    }

    std::cout << "encoder error" << status << ": " << status  << std::endl;
}


// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Builds the test image, copies it to the device, runs the expousure kernel
// from d_in to d_out, copies the result back and verifies it through
// encodeAndSave. Returns 0 on success and 1 if the image is unusable or any
// CUDA call fails.
int main(int argc, char *argv[])
{

    // decode the image to image from filename
    int width, height;
    const char* filename = argc > 1 ? argv[1] : "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/test.png";
    std::vector<unsigned char> h_image;
    decode(h_image, filename, width, height);

    if (width <= 0 || height <= 0) {
        std::cerr << "invalid image dimensions " << width << " x " << height << std::endl;
        return 1;
    }

    // Image size in bytes at 4 bytes per pixel, computed in size_t so the
    // product cannot overflow int.
    const size_t imageBytes = static_cast<size_t>(width) * height * 4;
    if (h_image.size() < imageBytes) {
        std::cerr << "image buffer holds fewer bytes than its dimensions require" << std::endl;
        return 1;
    }

    unsigned char *d_in = 0;
    unsigned char *d_out = 0;

    // Each step runs only if every previous one succeeded.
    bool ok = checkCuda(cudaMalloc(&d_in, imageBytes), "cudaMalloc(d_in)")
           && checkCuda(cudaMalloc(&d_out, imageBytes), "cudaMalloc(d_out)")
           && checkCuda(cudaMemcpy(d_in, &h_image[0], imageBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)");

    // Host result buffer sized from the actual image rather than a fixed
    // stack array, so it always matches the number of bytes copied back.
    std::vector<unsigned char> h_out(imageBytes);

    if (ok) {
        // One thread per byte: 4 * width threads along x, height along y.
        // Ceiling division gives just enough blocks; the kernel's own range
        // check idles the surplus threads in the last block row and column.
        const dim3 block(BLOCK_WIDTH, BLOCK_WIDTH, 1);
        const dim3 grid((4 * width + BLOCK_WIDTH - 1) / BLOCK_WIDTH,
                        (height + BLOCK_WIDTH - 1) / BLOCK_WIDTH, 1);
        expousure<<<grid, block>>>(d_in, d_out, width, height);

        // cudaGetLastError catches a bad launch configuration; the copy
        // back is checked as well so a failed transfer is not ignored.
        ok = checkCuda(cudaGetLastError(), "expousure launch")
          && checkCuda(cudaMemcpy(&h_out[0], d_out, imageBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
    }

    // Release device memory on every path; pointers that were never
    // allocated are still 0 here.
    cudaFree(d_in);
    cudaFree(d_out);

    if (!ok)
        return 1;

    // encode and save image from image to filename
    const char* outname = "C:/Users/Russell/Documents/Visual Studio 2013/Projects/Hello CUDA/Release/testOUT.png";
    encodeAndSave(&h_out[0], outname, width, height);
    std::cout << "Success!" << std::endl;
    return 0;
}

CUDA copy image only copying part of image

1 Answer