TLDR: The problem is with the amount of device memory allocated for the image and the indexing scheme used to access the pixel values inside the kernel. Use the corrected implementation from the last code section of this answer.
Following is the explanation of problematic aspects of the provided implementation.
1. Total number of image bytes
The input image is an 8 bit RGB image, so the theoretical number of bytes occupied by it is equal to width x height x number_of_channels
. In this case, it should be numRows * numCols * 3
. But practically, OpenCV allocates aligned memory for image data, so the total number of image bytes should be calculated as image.step * numrows
regardless of image type and number of channels. That being said, the cudaMalloc
and cudaMemcpy
calls expect total number of bytes we want to allocate or copy respectively. Correct the calls as follows (adapting code from @micehlson's answer):
const size_t numBytes = input.step * numrows;
cudaMalloc<unsigned char>(&d_input, numBytes);
^
cudaMalloc<unsigned char>(&d_output, numBytes);
^
//Copy data from OpenCV input image to device memory
cudaMemcpy(d_input, input.ptr(), numBytes, cudaMemcpyHostToDevice);
^
//copy output from device to host
cudaMemcpy(output.ptr(), d_output, numBytes, cudaMemcpyDeviceToHost);
^
2. Pixel index calculation in the kernel
Since the image memory is aligned, the actual index of a pixel should be calculated using the step
parameter of the Mat object. The generic formula for calculating the start index of a pixel in an OpenCV Mat
is as follows:
index = row * step/bytes_per_pixel_component + (channels * column)
For an 8 bit RGB image, the number of bytes occupied by a single component of an RGB pixel is 1 byte. Meaning that an individual R or G or B occupies 1 byte while a whole RGB pixel is 3 bytes. So the starting index is calculated as
int index = row * step + 3 * column;
Since this is the start index, each individual channel of this specific pixel can be accessed by increment this index up-to the number of channels as follows:
int R = index;
int G = index + 1;
int B = index + 2;
Subsequently, the index of the pixel in the flipped image can be calculated as follows (Assuming flip about y axis):
int flipped_index = row * step + 3 * (numCols - column - 1);
Of-course, we would require the image step as an argument to the kernel.
The final kernel may look like this:
__global__ void mirror( unsigned char* input, unsigned char* output, int numRows, int numCols, int channels, int step)
{
//2D Index of current thread
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;
if ( col >= numCols || row >= numRows ) return;
const int tid = row * step + (channels * col);
const int tid_flipped = row * step + (channels * (numCols - col - 1)); //Flip about y axis
//Copy each component of the current pixel
for(int i=0; i<channels; i++)
output[tid_flipped + i] = input[tid + i];
}
Making all the corrections, the final code may look like this:
#include<iostream>
#include<cstdio>
#include<opencv2/core.hpp>
#include<opencv2/imgcodecs.hpp>
#include<opencv2/highgui.hpp>
#include<cuda_runtime.h>
using std::cout;
using std::endl;
__global__ void mirror( unsigned char* input, unsigned char* output, int numRows, int numCols, int channels, int step)
{
//2D index of current thread
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;
if ( col >= numCols || row >= numRows ) return;
const int tid = row * step + (3 * col);
const int tid_new = row * step + (3 * (numCols - col - 1)); //Flip about y axis
//Copy each component of the current pixel
for(int i=0; i<channels; i++)
output[tid_new + i] = input[tid + i];
}
void convert_to_mirror(const cv::Mat& input, cv::Mat& output,int numrows,int numcols)
{
const dim3 blockSize(1024,1,1);
int a=numcols/blockSize.x, b=numrows/blockSize.y;
const dim3 gridSize(a+1,b+1,1);
const size_t numBytes = input.step * input.rows;
unsigned char *d_input, *d_output;
cudaMalloc<unsigned char>(&d_input, numBytes);
cudaMalloc<unsigned char>(&d_output,numBytes);
//Copy data from OpenCV input image to device memory
cudaMemcpy(d_input,input.ptr(), numBytes, cudaMemcpyHostToDevice);
//Call mirror kernel.
mirror<<<gridSize, blockSize>>>(d_input,d_output, numrows, numcols, input.channels(), input.step);
assert(cudaSuccess == cudaDeviceSynchronize());
//copy output from device to host
cudaMemcpy(output.ptr(), d_output,numBytes, cudaMemcpyDeviceToHost);
cudaFree(d_input);
cudaFree(d_output);
}
int main()
{
//Read input image from the disk
cv::Mat input = cv::imread("C:/a.jpg", cv::IMREAD_COLOR);
const int rows = input.rows;
const int cols = input.cols;
if(input.empty())
{
std::cout<<"Image Not Found!"<<std::endl;
std::cin.get();
return -1;
}
//Create output image
cv::Mat output(rows,cols,CV_8UC3);
//Call the wrapper function
convert_to_mirror(input,output,rows,cols);
//Show the input and output
cv::imshow("Input",input);
cv::imshow("Output",output);
//Wait for key press
cv::waitKey();
return 0;
}
Compiled with the following command:
nvcc -o mirror -std=c++11 mirror.cu -I/usr/local/include/opencv4
-L/usr/local/lib -lopencv_core -lopencv_imgcodecs -lopencv_highgui
Tested with OpenCV 4.0 and CUDA 9 on Ubuntu 16.04