Deleting one image channel using CUDA and OpenCV

Question

I have just started learning OpenCV. I want to delete one channel of an image using a CUDA kernel and then visualize how that affects the original image. But the program doesn't work, and I have no idea why: it just shows a black window :( Here is the code:

#include "opencv2\opencv.hpp"
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <device_functions.h>


using namespace cv;


// Kernel as posted in the question (the version that does not work).
// Each thread derives one flat index i from its block and thread ids, copies
// bytes i and i+1 from `in` to `out`, and writes 0 to byte i+2.
// NOTE(review): i advances by 1 from one thread to the next, not by 3, so
// thread i and thread i+1 touch overlapping byte ranges (i..i+2 vs i+1..i+3);
// a byte zeroed by one thread is overwritten with input data by another.
// NOTE(review): there is no bounds check; the kernel receives no image size.
__global__ void imgProc(unsigned char *in, unsigned char * out)
{
    // Flat 1-D global thread index.
    int i = threadIdx.x + blockIdx.x * blockDim.x;
    out[i] =in[i];
    out[i+1] = in[i+1];
    out[i + 2] = 0; // intended to delete one channel (see the overlap note above)


}

// Host program as posted in the question: loads "sw.jpg", runs imgProc on a
// device copy of its pixel bytes, and shows the result next to the original.
// NOTE(review): no CUDA call below has its return value checked, and nothing
// verifies that imread() actually loaded an image.
int main()
{
    Mat file1 = imread("sw.jpg", CV_LOAD_IMAGE_COLOR);  
    unsigned char *input = (unsigned char*)(file1.data);
    unsigned char *dev_input, *dev_output;
    // Host buffer for the processed image: cols * rows * 3 bytes.
    unsigned char *output = (unsigned char*)malloc(file1.cols*file1.rows * 3 * sizeof(char));

    // Device buffers of the same size, then upload the source pixels.
    cudaMalloc((void**)&dev_input, file1.cols*file1.rows * 3 * sizeof(char));
    cudaMalloc((void**)&dev_output, file1.cols*file1.rows * 3 * sizeof(char));
    cudaMemcpy(dev_input, input, file1.cols*file1.rows * 3 * sizeof(char), cudaMemcpyHostToDevice);
    // Launches file1.cols blocks of file1.rows threads each.
    // NOTE(review): a block size above the device's threads-per-block limit
    // makes the launch fail; with no error check that failure goes unnoticed
    // and dev_output is copied back without ever having been written.
    imgProc << <file1.cols, file1.rows >> > (dev_input, dev_output);
    cudaMemcpy(output, dev_output, file1.cols*file1.rows * 3 * sizeof(char), cudaMemcpyDeviceToHost);

    // file3 is constructed directly on top of the `output` buffer.
    Mat file3 =  Mat(file1.rows,file1.cols, CV_8UC3,output);
    namedWindow("Modified", CV_WINDOW_FREERATIO);
    imshow("Modified", file3);
    namedWindow("Original", CV_WINDOW_FREERATIO);
    imshow("Original", file1);

    cudaFree(dev_input);
    cudaFree(dev_output);
    // NOTE(review): `output` is released here, before waitKey(), while file3
    // still refers to it.
    free(output);


    waitKey(); 

    return 0;
}

I see no error checking anywhere in your code. Are you sure there are no runtime errors? What is the image dimensions you are using? — talonmies, Nov 02 '17 at 15:30
OK, I have figured it out. It turned out that the number of columns was greater than the maximum number of threads on my GPU. I will paste the corrected code into my question, so maybe it will help someone ;p — karollo, Nov 02 '17 at 15:46
@KarolŻurowski you really shouldn't update your question. You can *answer* your own question though and accept that. That will be much more useful. — GPPK, Nov 02 '17 at 15:51

score 1 · Answer 1 · answered Nov 02 '17 at 15:12

1

You seem to be making this more complicated than it needs to be. OpenCV provides all the functions you need to complete this task:

split(src,BGRChannels); // split the BGR channesl
BGRChannels[1]=Mat::zeros(src.rows,src.cols,CV_8UC1);// removing Green channel
merge(BGRChannels,3,src); // pack the image

answered Nov 02 '17 at 15:12

GPPK

6,546
4
32
57

1

I know, I don't want to make it simple, I want to combine it with CUDA in order to practice – karollo Nov 02 '17 at 15:14

score 1 · Answer 2 · answered Nov 02 '17 at 15:56

OK, I got it. There were some mistakes in the kernel, but most importantly, the image dimension I was using as the thread count was greater than the maximum number of threads per block on my GPU.

Here is the working code, which deletes one image channel from the picture:

#include "opencv2\opencv.hpp"
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <device_functions.h>


using namespace cv;


/*
 * imgProc - copy a packed 3-channel, 8-bit image and zero channel 0 of every
 * pixel (the blue channel for the BGR layout that imread() produces).
 *
 * Launch layout: one thread per pixel, any 1-D/2-D grid of 1-D/2-D blocks whose
 * total extent covers width x height. Threads outside the image do nothing.
 * No shared memory is used.
 *
 * in     - device pointer to width*height*3 source bytes (tightly packed rows)
 * out    - device pointer to width*height*3 destination bytes
 * width  - image width in pixels;  <= 0 means "use gridDim.x * blockDim.x"
 * height - image height in pixels; <= 0 means "use gridDim.y * blockDim.y"
 *
 * The defaulted dimensions keep the original two-argument launch,
 * imgProc<<<dim3(cols, rows), 1>>>(in, out), working exactly as before.
 */
__global__ void imgProc(const unsigned char *in, unsigned char *out,
                        int width = 0, int height = 0)
{
    // Legacy launches pass no size: the grid then tiles the image exactly.
    if (width <= 0)  width  = gridDim.x * blockDim.x;
    if (height <= 0) height = gridDim.y * blockDim.y;

    const int x = blockIdx.x * blockDim.x + threadIdx.x;
    const int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so edge threads may fall outside.
    if (x >= width || y >= height)
        return;

    // size_t keeps the byte offset from overflowing int on very large images.
    const size_t base = ((size_t)y * width + x) * 3;
    out[base + 0] = 0;              // deleted channel
    out[base + 1] = in[base + 1];
    out[base + 2] = in[base + 2];
}

/* Report a failed CUDA runtime call on stderr; returns true when err is cudaSuccess. */
static bool cudaOk(cudaError_t err, const char *what)
{
    if (err == cudaSuccess)
        return true;
    std::cerr << "CUDA error in " << what << ": " << cudaGetErrorString(err) << std::endl;
    return false;
}

/*
 * Upload a packed width x height x 3 image, run imgProc on it, and download the
 * result into hostOut. Device buffers are always released before returning.
 * Returns false (after printing the CUDA error) if any step fails.
 */
static bool removeChannelOnGpu(const unsigned char *hostIn, unsigned char *hostOut,
                               int width, int height)
{
    const size_t bytes = (size_t)width * height * 3;
    unsigned char *devIn = NULL;
    unsigned char *devOut = NULL;

    bool ok = cudaOk(cudaMalloc((void**)&devIn, bytes), "cudaMalloc(input)")
           && cudaOk(cudaMalloc((void**)&devOut, bytes), "cudaMalloc(output)")
           && cudaOk(cudaMemcpy(devIn, hostIn, bytes, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok) {
        // 16x16 = 256 threads per block; the grid is rounded up to cover the
        // whole image and the kernel's bounds check discards the excess threads.
        const dim3 block(16, 16);
        const dim3 grid((width + block.x - 1) / block.x,
                        (height + block.y - 1) / block.y);
        imgProc<<<grid, block>>>(devIn, devOut, width, height);

        // cudaGetLastError() reports an invalid launch configuration; the
        // blocking copy that follows waits for the kernel and reports any
        // fault raised while it ran.
        ok = cudaOk(cudaGetLastError(), "imgProc launch")
          && cudaOk(cudaMemcpy(hostOut, devOut, bytes, cudaMemcpyDeviceToHost),
                    "cudaMemcpy(device to host)");
    }

    // cudaFree(NULL) is a no-op, so this is safe on every failure path.
    cudaFree(devIn);
    cudaFree(devOut);
    return ok;
}

/*
 * Loads "sw.jpg", removes channel 0 on the GPU, and shows the modified and the
 * original image until a key is pressed. Returns 0 on success, 1 on any error.
 */
int main()
{
    cudaDeviceProp prop;
    if (!cudaOk(cudaGetDeviceProperties(&prop, 0), "cudaGetDeviceProperties"))
        return 1;
    std::cout << prop.maxGridSize[1];

    Mat file1 = imread("sw.jpg", CV_LOAD_IMAGE_COLOR);
    if (file1.empty()) {
        std::cerr << "Could not read image sw.jpg" << std::endl;
        return 1;
    }
    // The kernel expects tightly packed rows; make sure there is no row padding.
    if (!file1.isContinuous())
        file1 = file1.clone();

    const size_t bytes = (size_t)file1.cols * file1.rows * 3;
    unsigned char *output = (unsigned char*)malloc(bytes);
    if (output == NULL) {
        std::cerr << "Could not allocate host output buffer" << std::endl;
        return 1;
    }

    if (!removeChannelOnGpu(file1.data, output, file1.cols, file1.rows)) {
        free(output);
        return 1;
    }

    // file3 only wraps `output` (no copy is made), so the buffer has to stay
    // alive for as long as the window may still be drawn from it.
    Mat file3 = Mat(file1.rows, file1.cols, CV_8UC3, output);
    namedWindow("Modified", CV_WINDOW_FREERATIO);
    imshow("Modified", file3);
    namedWindow("Original", CV_WINDOW_FREERATIO);
    imshow("Original", file1);

    waitKey();

    // Release the pixel buffer only after the windows are no longer needed.
    free(output);

    return 0;
}

Setting threads per block to 1 is wasting about 97% of your GPU's computational capacity. You might want to read this: https://stackoverflow.com/questions/9985912/how-do-i-choose-grid-and-block-dimensions-for-cuda-kernels — talonmies, Nov 02 '17 at 16:12
Kernel call signature is `imgProc <<< number_of_blocks, threads_per_block >>> (dev_input, dev_output);` — zindarod, Nov 02 '17 at 17:44

Deleting one image channel using CUDA and OpenCV

2 Answers