I'm learning CUDA and building my way towards implementing Gaussian filter. For starters I tried to implement CUDA program which will simply make a copy of an input image.
Instead of getting a copy of an image I get "mixed" pixels on smaller images, and gray or blank background on larger images.
Can you help me find the bug?
Feel free to suggest any additional improvements.
Input → Output Examples
Standard Lenna (500×500)
Mountains (1125×750)
Source code
#define subpixel unsigned char
struct Dimensions {
unsigned width;
unsigned height;
};
struct ImageVectors {
subpixel *red;
subpixel *green;
subpixel *blue;
subpixel *alpha;
};
__global__ void CopyKernel(subpixel *device_subpixelsVector, subpixel *device_subpixelsResult) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
device_subpixelsResult[index] = device_subpixelsVector[index];
}
ImageVectors CUDAGaussBlur(ImageVectors imageVectors, Dimensions dimensions) {
const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
const int sizeInBytes = dimensions.width * dimensions.height * sizeof(subpixel);
const int blockSize = 128;
const int gridSize = ceil(totalNumberOfSubpixels / blockSize);
const dim3 dimBlock(blockSize);
const dim3 dimGrid(gridSize);
ImageVectors transformedImage;
transformedImage.red = new subpixel[totalNumberOfSubpixels];
transformedImage.green = new subpixel[totalNumberOfSubpixels];
transformedImage.blue = new subpixel[totalNumberOfSubpixels];
transformedImage.alpha = new subpixel[totalNumberOfSubpixels];
subpixel *device_redVector;
subpixel *device_greenVector;
subpixel *device_blueVector;
subpixel *device_alphaVector;
subpixel *device_redResultVector;
subpixel *device_greenResultVector;
subpixel *device_blueResultVector;
subpixel *device_alphaResultVector;
cudaMalloc(&device_redVector, sizeInBytes);
cudaMalloc(&device_greenVector, sizeInBytes);
cudaMalloc(&device_blueVector, sizeInBytes);
cudaMalloc(&device_alphaVector, sizeInBytes);
cudaMalloc(&device_redResultVector, sizeInBytes);
cudaMalloc(&device_greenResultVector, sizeInBytes);
cudaMalloc(&device_blueResultVector, sizeInBytes);
cudaMalloc(&device_alphaResultVector, sizeInBytes);
cudaMemcpy(device_redVector, imageVectors.red, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(device_greenVector, imageVectors.green, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(device_blueVector, imageVectors.blue, sizeInBytes, cudaMemcpyHostToDevice);
cudaMemcpy(device_alphaVector, imageVectors.alpha, sizeInBytes, cudaMemcpyHostToDevice);
// Eventually CopyKernel will be replaced this with Gauss filter kernel.
CopyKernel<<<dimGrid, dimBlock>>>(device_redVector, device_redResultVector);
CopyKernel<<<dimGrid, dimBlock>>>(device_greenVector, device_greenResultVector);
CopyKernel<<<dimGrid, dimBlock>>>(device_blueVector, device_blueResultVector);
CopyKernel<<<dimGrid, dimBlock>>>(device_alphaVector, device_alphaResultVector);
cudaMemcpy(transformedImage.red, device_redResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(transformedImage.green, device_greenResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(transformedImage.blue, device_blueResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaMemcpy(transformedImage.alpha, device_alphaResultVector, sizeInBytes, cudaMemcpyDeviceToHost);
cudaFree(device_redVector);
cudaFree(device_greenVector);
cudaFree(device_blueVector);
cudaFree(device_alphaVector);
cudaFree(device_redResultVector);
cudaFree(device_greenResultVector);
cudaFree(device_blueResultVector);
cudaFree(device_alphaResultVector);
return transformedImage;
}
Every image vector (i.e. 1D array) is filled with single channel values from RGBA channels. I have omitted the part where reading, transforming and writing to image file happens.
Image input/output
I don't suspect to find a bug here. But I'll give it for completeness and because I'm almost never 100% sure.
enum Channel
{
R = 0,
G = 1,
B = 2,
A = 3
};
subpixel* extractChannelToVector(std::vector<subpixel> rgbaImage, Dimensions dimensions, Channel selectedChannel) {
std::vector<subpixel> vectorBuffer;
for (int row = 0; row < dimensions.height; row++) {
for (int column = 0; column < dimensions.width; column++) {
vectorBuffer.push_back(rgbaImage[4 * dimensions.width * row + 4 * column + selectedChannel]);
}
}
const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
subpixel *subpixelsVector = new subpixel[totalNumberOfSubpixels];
for (int index = 0; index++ < vectorBuffer.size(); index++)
subpixelsVector[index] = vectorBuffer[index];
return subpixelsVector;
}
std::vector<subpixel> vectorsToChannels(Dimensions dimensions, subpixel *redVector, subpixel *greenVector, subpixel *blueVector, subpixel *alphaVector) {
const int totalNumberOfSubpixels = dimensions.width * dimensions.height;
std::vector<subpixel> rgbaImage;
for (int index = 0; index < totalNumberOfSubpixels; index++) {
rgbaImage.push_back(redVector[index + Channel::R]);
rgbaImage.push_back(greenVector[index + Channel::G]);
rgbaImage.push_back(blueVector[index + Channel::B]);
rgbaImage.push_back(alphaVector[index + Channel::A]);
}
return rgbaImage;
}
std::vector<subpixel> vectorsToChannels(Dimensions dimensions, ImageVectors imageVectors) {
return vectorsToChannels(dimensions, imageVectors.red, imageVectors.green, imageVectors.blue, imageVectors.alpha);
}
int main() {
const char* filename = R"(lenna.png)";
cout << filename << endl;
std::vector<subpixel> png;
std::vector<subpixel> rgbaImage;
Dimensions dimensions;
lodepng::load_file(png, filename);
lodepng::decode(rgbaImage, dimensions.width, dimensions.height, png);
cout << "sizeof(image): " << rgbaImage.size() << endl
<< "width: " << dimensions.width << endl
<< "height: " << dimensions.height << endl;
ImageVectors imageVectors;
imageVectors.red = extractChannelToVector(rgbaImage, dimensions, Channel::R);
imageVectors.green = extractChannelToVector(rgbaImage, dimensions, Channel::G);
imageVectors.blue = extractChannelToVector(rgbaImage, dimensions, Channel::B);
imageVectors.alpha = extractChannelToVector(rgbaImage, dimensions, Channel::A);
std::vector<subpixel> transformedImage = vectorsToChannels(dimensions, CUDAGaussBlur(imageVectors, dimensions));
lodepng::encode("lenna-result.png", transformedImage, dimensions.width, dimensions.height);
return 0;
}
I'm using "lodepng" for reading and writing PNG files. I have used it successuly in this program when I have done Gaussian filtering using CPU. More about lodepng can be found here: lodev.org/lodepng/ , github.com/lvandeve/lodepng .
Update #1
With the suggestion from @jwdmsd I have short circuited kernel. More precisely I just copied image data from host (CPU) do device (GPU), and then from device back to host without using kernel.
The resulted image is gray with color #cdcdcdcd
. What is interesting is that 0xCD
is used by Microsoft compilers to fill in memory blocks in debug mode. According to SO::When and why will an OS initialise memory to 0xCD, 0xDD, etc. on malloc/free/new/delete? 0xCD
is called Clean Memory
and represents Allocated memory via malloc or new but never written by the application.
It seems like I have some memory/pointer problems. Where's the problem?
Result for Lenna