0

I just started learning OpenCV. I want to delete one channel of an image using a CUDA kernel and then visualize how that affects the original image. But the program doesn't work and I have no idea why. It just shows a black window :( Here is the code:

#include "opencv2\opencv.hpp"
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <device_functions.h>


using namespace cv;


// Kernel from the question: meant to copy two channels of every pixel from `in`
// to `out` and set the remaining channel to zero.
//
// NOTE(review): this version does not do that. `i` is the global thread index and
// is used directly as a byte offset, so thread i touches bytes i, i+1 and i+2 while
// threads i+1 and i+2 touch some of the same bytes (unsynchronized overlapping
// writes). For interleaved 3-byte pixels the offset would need to be 3 * i.
// There is also no bounds check against the buffer size.
__global__ void imgProc(unsigned char *in, unsigned char * out)
{
    // Global 1D thread index across the whole grid.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    out[i] =in[i];
    out[i+1] = in[i+1];
    out[i + 2] = 0; // intended to delete one channel (see NOTE above: offset is not scaled by 3)


}

// Host code from the question: loads "sw.jpg", copies it to the GPU, runs imgProc,
// copies the result back and shows the modified and original images.
//
// NOTE(review): no return value is checked anywhere (imread, cudaMalloc, cudaMemcpy,
// the kernel launch), so any failure goes unnoticed.
int main()
{
    Mat file1 = imread("sw.jpg", CV_LOAD_IMAGE_COLOR);  
    // Raw pointer to the image bytes; assumes the Mat is non-empty and stored
    // contiguously -- TODO confirm, neither is checked here.
    unsigned char *input = (unsigned char*)(file1.data);
    unsigned char *dev_input, *dev_output;
    // Host buffer for the result: 3 bytes per pixel.
    unsigned char *output = (unsigned char*)malloc(file1.cols*file1.rows * 3 * sizeof(char));

    cudaMalloc((void**)&dev_input, file1.cols*file1.rows * 3 * sizeof(char));
    cudaMalloc((void**)&dev_output, file1.cols*file1.rows * 3 * sizeof(char));
    cudaMemcpy(dev_input, input, file1.cols*file1.rows * 3 * sizeof(char), cudaMemcpyHostToDevice);
    // Launch arguments are <<<number_of_blocks, threads_per_block>>>, so this asks for
    // file1.cols blocks of file1.rows threads each.
    // NOTE(review): threads per block is capped by the device (cudaDeviceProp::
    // maxThreadsPerBlock); if file1.rows exceeds it the launch fails, the kernel never
    // runs, and because the error is not queried the code below copies back a
    // dev_output buffer that was never written.
    imgProc << <file1.cols, file1.rows >> > (dev_input, dev_output);
    cudaMemcpy(output, dev_output, file1.cols*file1.rows * 3 * sizeof(char), cudaMemcpyDeviceToHost);

    // Wraps the `output` buffer as a rows x cols 8-bit 3-channel image.
    Mat file3 =  Mat(file1.rows,file1.cols, CV_8UC3,output);
    namedWindow("Modified", CV_WINDOW_FREERATIO);
    imshow("Modified", file3);
    namedWindow("Original", CV_WINDOW_FREERATIO);
    imshow("Original", file1);

    cudaFree(dev_input);
    cudaFree(dev_output);
    // NOTE(review): file3 was constructed on top of `output` and is still in scope
    // here, so it refers to freed memory from this point on.
    free(output);


    // Block until a key is pressed so the windows stay visible.
    waitKey(); 

    return 0;
}
karollo
  • 573
  • 8
  • 22
  • I see no error checking anywhere in your code. Are you sure there are no runtime errors? What are the dimensions of the image you are using? – talonmies Nov 02 '17 at 15:30
  • OK, I have figured it out. It turned out that the number of columns was greater than the maximum number of threads on my GPU. I will paste the corrected code into my question, so maybe it will help someone ;p – karollo Nov 02 '17 at 15:46
  • @KarolŻurowski you really shouldn't update your question. You can *answer* your own question though and accept that. That will be much more useful. – GPPK Nov 02 '17 at 15:51
  • @GPPK That seems reasonable, I have followed your advice :) – karollo Nov 02 '17 at 15:57

2 Answers2

1

You seem to be making this more complicated than it needs to be; OpenCV already provides all the functions you need to complete this task:

// Fragment: `src` and `BGRChannels` are declared elsewhere. Presumably `src` is an
// 8-bit 3-channel BGR image and `BGRChannels` is a `Mat BGRChannels[3]` array -- confirm.
split(src,BGRChannels); // split the image into its separate B, G and R channel planes
BGRChannels[1]=Mat::zeros(src.rows,src.cols,CV_8UC1);// replace the green plane (index 1) with zeros
merge(BGRChannels,3,src); // pack the three planes back into src
GPPK
  • 6,546
  • 4
  • 32
  • 57
1

OK, I got it. There were some mistakes in the kernel, but most importantly, the size of the picture I was working on was greater than the maximum number of threads per block on my GPU.

Here is the working code, which deletes one image channel from the picture:

#include "opencv2\opencv.hpp"
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <device_functions.h>


using namespace cv;


// Copies an interleaved 3-byte-per-pixel image from `in` to `out`, setting byte 0
// of every pixel to zero (the blue channel for images loaded with cv::imread,
// which stores pixels in B, G, R order) and keeping bytes 1 and 2.
//
// Launch layout: a 2D grid with exactly one block per pixel
// (gridDim.x = image width, gridDim.y = image height). Only blockIdx is used to
// select the pixel, so the kernel is meant to be launched with one thread per
// block; extra threads in a block would just repeat the same writes.
//
// in  - device buffer holding gridDim.x * gridDim.y * 3 bytes, read only
// out - device buffer of the same size, receives the result
__global__ void imgProc(const unsigned char *in, unsigned char *out)
{
    // Row-major pixel index. Computed in size_t so that pixel * 3 cannot overflow
    // 32-bit arithmetic for very large images.
    const size_t pixel = static_cast<size_t>(blockIdx.y) * gridDim.x + blockIdx.x;
    const size_t base = pixel * 3;

    out[base + 0] = 0;             // channel being removed
    out[base + 1] = in[base + 1];
    out[base + 2] = in[base + 2];
}

// Prints a diagnostic on stderr when a CUDA runtime call failed.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    std::cerr << what << " failed: " << cudaGetErrorString(status) << std::endl;
    return false;
}

// Uploads `src` to the GPU, runs imgProc on it and downloads the result into `dst`.
//
// src  - continuous 8-bit 3-channel image
// dst  - continuous 8-bit 3-channel image of the same size, overwritten
// prop - properties of the device in use, needed to validate the grid size
//
// Returns false (after printing a diagnostic) if the image does not fit the
// launch limits or any CUDA call fails. Device memory is released on every path.
static bool removeChannelOnGpu(const Mat &src, Mat &dst, const cudaDeviceProp &prop)
{
    // imgProc selects the pixel from blockIdx alone, so it needs one block per
    // pixel and the image dimensions must fit into the grid limits.
    if (src.cols > prop.maxGridSize[0] || src.rows > prop.maxGridSize[1]) {
        std::cerr << "Image of " << src.cols << "x" << src.rows
                  << " pixels exceeds the maximum grid size of this GPU" << std::endl;
        return false;
    }

    const size_t bytes = static_cast<size_t>(src.cols) * src.rows * 3 * sizeof(unsigned char);
    unsigned char *dev_input = NULL;
    unsigned char *dev_output = NULL;

    bool ok = cudaOk(cudaMalloc((void**)&dev_input, bytes), "cudaMalloc(dev_input)")
           && cudaOk(cudaMalloc((void**)&dev_output, bytes), "cudaMalloc(dev_output)")
           && cudaOk(cudaMemcpy(dev_input, src.data, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok) {
        // One block per pixel, one thread per block: this is the layout imgProc
        // expects. It is simple but leaves most of the GPU idle.
        dim3 grid(src.cols, src.rows);
        imgProc<<<grid, 1>>>(dev_input, dev_output);

        // A bad launch configuration is only reported through cudaGetLastError();
        // errors raised while the kernel runs surface at the blocking copy below.
        ok = cudaOk(cudaGetLastError(), "imgProc launch")
          && cudaOk(cudaMemcpy(dst.data, dev_output, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // cudaFree on a null pointer is a no-op, so this is safe after a failed cudaMalloc.
    cudaFree(dev_input);
    cudaFree(dev_output);
    return ok;
}

// Loads "sw.jpg", removes one colour channel on the GPU and shows the modified
// and the original image side by side until a key is pressed.
// Returns 0 on success and 1 if the image cannot be read or a CUDA call fails.
int main()
{
    cudaDeviceProp prop;
    if (!cudaOk(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties"))
        return 1;
    std::cout << (int)prop.maxGridSize[1];

    Mat file1 = imread("sw.jpg", CV_LOAD_IMAGE_COLOR);
    if (file1.empty()) {
        std::cerr << "Could not read image sw.jpg" << std::endl;
        return 1;
    }
    // The raw copy to the device assumes rows are stored back to back.
    if (!file1.isContinuous())
        file1 = file1.clone();

    // Let the Mat own the result buffer so it stays valid for as long as it is shown.
    Mat file3(file1.rows, file1.cols, CV_8UC3);
    if (!removeChannelOnGpu(file1, file3, prop))
        return 1;

    namedWindow("Modified", CV_WINDOW_FREERATIO);
    imshow("Modified", file3);
    namedWindow("Original", CV_WINDOW_FREERATIO);
    imshow("Original", file1);

    waitKey();

    return 0;
}
karollo
  • 573
  • 8
  • 22
  • Setting threads per block to 1 is wasting about 97% of your GPU's computational capacity. You might want to read this: https://stackoverflow.com/questions/9985912/how-do-i-choose-grid-and-block-dimensions-for-cuda-kernels – talonmies Nov 02 '17 at 16:12
  • Kernel call signature is `imgProc <<< number_of_blocks, threads_per_block >>> (dev_input, dev_output);` – zindarod Nov 02 '17 at 17:44