CUDA Mixed pixels in output

Question

I'm learning CUDA and building my way towards implementing Gaussian filter. For starters I tried to implement CUDA program which will simply make a copy of an input image.

Instead of getting a copy of an image I get "mixed" pixels on smaller images, and gray or blank background on larger images.

Can you help me find the bug?

Feel free to suggest any additional improvements.

Input → Output Examples

Standard Lenna (500×500)

Mountains (1125×750)

Source code

#define subpixel unsigned char 

struct Dimensions {
    unsigned width;
    unsigned height;
};

struct ImageVectors {
    subpixel *red;
    subpixel *green;
    subpixel *blue;
    subpixel *alpha;
};

__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    device_subpixelsResult[index] = device_subpixelsVector[index];
}

ImageVectors CUDAGaussBlur(ImageVectors imageVectors, Dimensions dimensions) {
    const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
    const int sizeInBytes = dimensions.width * dimensions.height * sizeof(subpixel);

    const int blockSize = 128;
    const int gridSize = ceil(totalNumberOfSubpixels / blockSize);
    const dim3 dimBlock(blockSize);
    const dim3 dimGrid(gridSize);

    ImageVectors transformedImage;
    transformedImage.red = new subpixel[totalNumberOfSubpixels];
    transformedImage.green = new subpixel[totalNumberOfSubpixels];
    transformedImage.blue = new subpixel[totalNumberOfSubpixels];
    transformedImage.alpha = new subpixel[totalNumberOfSubpixels];

    subpixel *device_redVector;
    subpixel *device_greenVector;
    subpixel *device_blueVector;
    subpixel *device_alphaVector;

    subpixel *device_redResultVector;
    subpixel *device_greenResultVector;
    subpixel *device_blueResultVector;
    subpixel *device_alphaResultVector;

    cudaMalloc(&device_redVector, sizeInBytes);
    cudaMalloc(&device_greenVector, sizeInBytes);
    cudaMalloc(&device_blueVector, sizeInBytes);
    cudaMalloc(&device_alphaVector, sizeInBytes);

    cudaMalloc(&device_redResultVector, sizeInBytes);
    cudaMalloc(&device_greenResultVector, sizeInBytes);
    cudaMalloc(&device_blueResultVector, sizeInBytes);
    cudaMalloc(&device_alphaResultVector, sizeInBytes);

    cudaMemcpy(device_redVector, imageVectors.red, sizeInBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_greenVector, imageVectors.green, sizeInBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_blueVector, imageVectors.blue, sizeInBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(device_alphaVector, imageVectors.alpha, sizeInBytes, cudaMemcpyHostToDevice);

    // Eventually CopyKernel will be replaced this with Gauss filter kernel.
    CopyKernel<<<dimGrid, dimBlock>>>(device_redVector, device_redResultVector);
    CopyKernel<<<dimGrid, dimBlock>>>(device_greenVector, device_greenResultVector);
    CopyKernel<<<dimGrid, dimBlock>>>(device_blueVector, device_blueResultVector);
    CopyKernel<<<dimGrid, dimBlock>>>(device_alphaVector, device_alphaResultVector);

    cudaMemcpy(transformedImage.red, device_redResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(transformedImage.green, device_greenResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(transformedImage.blue, device_blueResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(transformedImage.alpha, device_alphaResultVector, sizeInBytes, cudaMemcpyDeviceToHost);

    cudaFree(device_redVector);
    cudaFree(device_greenVector);
    cudaFree(device_blueVector);
    cudaFree(device_alphaVector);

    cudaFree(device_redResultVector);
    cudaFree(device_greenResultVector);
    cudaFree(device_blueResultVector);
    cudaFree(device_alphaResultVector);

    return transformedImage;
}

Every image vector (i.e. 1D array) is filled with single channel values from RGBA channels. I have omitted the part where reading, transforming and writing to image file happens.

Image input/output

I don't suspect to find a bug here. But I'll give it for completeness and because I'm almost never 100% sure.

enum Channel
{
    R = 0,
    G = 1,
    B = 2,
    A = 3
};

subpixel* extractChannelToVector(std::vector<subpixel> rgbaImage, Dimensions dimensions, Channel selectedChannel) {
    std::vector<subpixel> vectorBuffer;

    for (int row = 0; row < dimensions.height; row++) {
        for (int column = 0; column < dimensions.width; column++) {
            vectorBuffer.push_back(rgbaImage[4 * dimensions.width * row + 4 * column + selectedChannel]);
        }
    }

    const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
    subpixel *subpixelsVector = new subpixel[totalNumberOfSubpixels];
    for (int index = 0; index++ < vectorBuffer.size(); index++)
        subpixelsVector[index] = vectorBuffer[index];
    return subpixelsVector;
}

std::vector<subpixel> vectorsToChannels(Dimensions dimensions, subpixel *redVector, subpixel *greenVector, subpixel *blueVector, subpixel *alphaVector) {
    const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
    std::vector<subpixel> rgbaImage;
    for (int index = 0; index < totalNumberOfSubpixels; index++) {
        rgbaImage.push_back(redVector[index + Channel::R]);
        rgbaImage.push_back(greenVector[index + Channel::G]);
        rgbaImage.push_back(blueVector[index + Channel::B]);
        rgbaImage.push_back(alphaVector[index + Channel::A]);
    }
    return rgbaImage;
}

std::vector<subpixel> vectorsToChannels(Dimensions dimensions, ImageVectors imageVectors) {
    return vectorsToChannels(dimensions, imageVectors.red, imageVectors.green, imageVectors.blue, imageVectors.alpha);
}

int main() {
    const char* filename = R"(lenna.png)";
    cout << filename << endl;

    std::vector<subpixel> png;
    std::vector<subpixel> rgbaImage;
    Dimensions dimensions;

    lodepng::load_file(png, filename);
    lodepng::decode(rgbaImage, dimensions.width, dimensions.height, png);

    cout << "sizeof(image): " << rgbaImage.size() << endl
        << "width: " << dimensions.width << endl
        << "height: " << dimensions.height << endl;

    ImageVectors imageVectors;
    imageVectors.red = extractChannelToVector(rgbaImage, dimensions, Channel::R);
    imageVectors.green = extractChannelToVector(rgbaImage, dimensions, Channel::G);
    imageVectors.blue = extractChannelToVector(rgbaImage, dimensions, Channel::B);
    imageVectors.alpha = extractChannelToVector(rgbaImage, dimensions, Channel::A);

    std::vector<subpixel> transformedImage = vectorsToChannels(dimensions, CUDAGaussBlur(imageVectors, dimensions));

    lodepng::encode("lenna-result.png", transformedImage, dimensions.width, dimensions.height);

    return 0;
}

I'm using "lodepng" for reading and writing PNG files. I have used it successuly in this program when I have done Gaussian filtering using CPU. More about lodepng can be found here: lodev.org/lodepng/ , github.com/lvandeve/lodepng .

Update #1

With the suggestion from @jwdmsd I have short circuited kernel. More precisely I just copied image data from host (CPU) do device (GPU), and then from device back to host without using kernel.

The resulted image is gray with color #cdcdcdcd. What is interesting is that 0xCD is used by Microsoft compilers to fill in memory blocks in debug mode. According to SO::When and why will an OS initialise memory to 0xCD, 0xDD, etc. on malloc/free/new/delete? 0xCD is called Clean Memory and represents Allocated memory via malloc or new but never written by the application. It seems like I have some memory/pointer problems. Where's the problem?

Result for Lenna

There's nothing much in the kernel to be looked at. It might be the problem with the way you are handling input/output. A quick way to check is to avoid the kernel altogether and just cudaMemcpy device_redVector (from GPU) to red (back to CPU) and for others channels and check the result. — jwdmsd, May 21 '17 at 13:29
@jwdmsd Copying memory from host to device and back is not something I'm aiming for. I'm using kernel on purpose because I plan to extend this program for Gaussian filter (where kernel will be certainly needed). — Miro, May 21 '17 at 13:37
Yes I know, but first you have to rule out whether the problem is with kernel or not. — jwdmsd, May 21 '17 at 13:38
@jwdmsd Good idea! I'll try that one out and come back with an update. — Miro, May 21 '17 at 13:51
@devnglee I'm not sure what do you mean by padding pixels in this case. `lodepng` returns an image data as 1D vector of a sequence of RBGA values for each pixel. Taken from from their official web: "The function, decodePNG, converts any PNG file data, with any colortype or interlace mode, into a 32-bit image in an std::vector at the output.". I had no problems with manipulating image data when I implemented Gaussian filter using only CPU. — Miro, May 21 '17 at 14:26
You should check the return values of each CUDA api with `cudaSuccess` as explained [here](http://stackoverflow.com/a/14038590/7724939) and run your app with `cuda-memcheck`. — nglee, May 21 '17 at 14:42
@devnglee I will try that a bit later and will come with the update. — Miro, May 21 '17 at 16:48
jwdmsd and devnglee thank you for your help so far! :) @everyone Why are people downvoting the question? I really put a lot of effort into solving this problem and also in the question itself (Not to skip any details and to show what is going on. It might even help someone in the future.) If you are going to downvote it please at least explain why. I've talked to my colleagues and none of us can think of a viable reason besides "Some people are irrational or toxic". — Miro, May 21 '17 at 17:00
@kr85 I do not know why people gave down votes and close votes, but the fact that the CUDA tag is being flooded with "debug my code for me" questions these days that are of little or no use to subsequent visitors may have something to do with it. The close votes specifically are for failure to provide an [MCVE](https://stackoverflow.com/help/mcve) (this may or may not have been fixed since the question was originally posted). Not sure whether close reasons are visible at your karma level (> 500). — njuffa, May 22 '17 at 14:22
So now that the dust has settled on this and someone kindly pointed out a number of completely trivial mistakes in your code, I'll bite. I am one of the **five** "irrational/toxic" who downvoted this, so thank you for the very helpful character analysis. I downvoted (and voted to close, along with **four** others) because this question has absolutely no value at all. As best as I can tell, you made little or no serious attempt to debug this yourself, instead choosing to dump not working code here and wait for someone to fix it, and the result is of no use to anyone but yourself — talonmies, May 25 '17 at 06:38
@talonomies Don't take it personalty. But put yourself in my position where you get down-voted without knowing why. How would you feel and what would you think? And you are wrong in saying that I haven't put serious attempt to debugging (don't jump to conclusions). I agree that there are some silly mistakes, but at that moment I didn't noticed them. Those situations happen.Now that I have found the source of the error I completely agree that this post doesn't bring any value. That's something I didn't knew before posting it.I tried to do my best with information and situation I had at the time — Miro, May 25 '17 at 17:45

score 3 · Accepted Answer · edited May 23 '17 at 12:34

I don't think your problem is with CUDA.

Change

for (int index = 0; index++ < vectorBuffer.size(); index++)
                    ^^^^^^^
    subpixelsVector[index] = vectorBuffer[index];

to

for (int index = 0; index < vectorBuffer.size(); index++)
                    ^^^^^
    subpixelsVector[index] = vectorBuffer[index];

and also change

for (int index = 0; index < totalNumberOfSubpixels; index++) {
    rgbaImage.push_back(redVector[index + Channel::R]);
    rgbaImage.push_back(greenVector[index + Channel::G]);
    rgbaImage.push_back(blueVector[index + Channel::B]);
    rgbaImage.push_back(alphaVector[index + Channel::A]);
}

to

for (int index = 0; index < totalNumberOfSubpixels; index++) {
    rgbaImage.push_back(redVector[index]);
    rgbaImage.push_back(greenVector[index]);
    rgbaImage.push_back(blueVector[index]);
    rgbaImage.push_back(alphaVector[index]);
}

+) You'd better modify your kernel as follows:

__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult, int totalNumberOfSubpixels) {
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    if (index < totalNumberOfSubpixels)
        device_subpixelsResult[index] = device_subpixelsVector[index];
}

Also, read this, especially where it says "How to get Useful Answers to your CUDA Questions on Stack Overflow". It should help you to get better answers from people here.

CUDA Mixed pixels in output

Input → Output Examples

Source code

Image input/output

Update #1

1 Answers1