2

As far as I'm aware, cv::cuda::PtrStep is used to pass GpuMat data directly to a custom kernel. I found examples of single-channel access here; however, my case is a 2-channel mat (CV_32FC2). I'm trying to compute the complex absolute squared value, where the complex values are encoded as follows: the real part is the 1st channel and the imaginary part is the 2nd channel of the given Mat.

I tried:

// Attempt at a per-pixel |z|^2 kernel for a 2-channel float matrix:
// reads channel 0 (real part) and channel 1 (imaginary part) of `input`
// and writes re*re + im*im to the single-channel `output`.
//
// NOTE: this version does not compile as device code. cv::Vec<float, 2>::operator[]
// is a __host__ function, so nvcc rejects the two element reads below.
__global__ void testKernel(const cv::cuda::PtrStepSz<cv::Vec2f> input, cv::cuda::PtrStepf output)
{
    // Global 2D thread coordinates: x from the grid's x dimension, y from its y dimension.
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;

    // Only threads that fall inside the cols x rows extent of `input` do any work.
    if (x <= input.cols - 1 && y <= input.rows - 1 && y >= 0 && x >= 0)
    {
        // NOTE(review): PtrStep's operator() reportedly takes (row, column), so these
        // accesses likely need to be (y, x) rather than (x, y) -- confirm against the OpenCV docs.
        float val_re = input(x, y)[0];
        float val_im = input(x, y) [1];
        // Squared magnitude of the complex value (no square root is taken).
        output(x, y) = val_re * val_re + val_im * val_im;
    }
}

but this results in the following error:

calling a __host__ function("cv::Vec<float, (int)2> ::operator []") from a __global__ function("gpuholo::testKernel") is not allowed

I get it. [] is a __host__-restricted function since it's cv::Vec2f, not cv::cuda::Vec2f (which apparently does not exist). But I would still really like to access the data.

Is there other mechanism to access 2-channel data on device side similar to Vec2f?


I thought of a workaround in the form of splitting the input into two CV_32FC1 Mats, so the kernel would look like:

__global__ void testKernel(const cv::cuda::PtrStepSzf re, const cv::cuda::PtrStepSzf im, cv::cuda::PtrStepf output)

but I'm wondering whether there's a "cleaner" solution, Vec2f-like one.

michelson
  • 686
  • 9
  • 22
  • You can use `float2` instead of `cv::Vec2f`. Also `input(x, y)` should be `input(y, x)`, because the first parameter is the row and the second the column. – dari Sep 24 '17 at 13:59

2 Answers2

4

You can use raw data types to access the data of a GpuMat in a custom CUDA kernel. For example, the float2 type provided by the CUDA runtime can be used as a partial replacement for cv::Vec2f. Here is example code demonstrating the use of raw data types for accessing GpuMat data.

#include <iostream>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>

using std::cout;
using std::endl;

/**
 * Computes the magnitude sqrt(re^2 + im^2) of every element of a row-padded
 * matrix of float2 values and stores it in a row-padded matrix of floats.
 *
 * Launch layout: 2D grid of 2D blocks; the x dimension covers columns and the
 * y dimension covers rows. Threads outside rows x cols do nothing, so the grid
 * may overshoot the matrix size.
 *
 * @param src    Input elements (.x = first component, .y = second component); read-only.
 * @param dst    Output magnitudes.
 * @param rows   Number of rows to process.
 * @param cols   Number of columns to process.
 * @param iStep  Distance between consecutive input rows, in float2 elements (not bytes).
 * @param oStep  Distance between consecutive output rows, in float elements (not bytes).
 */
__global__ void kernel_absolute(const float2* src, float* dst, int rows, int cols, int iStep, int oStep)
{
    const int i = blockIdx.y * blockDim.y + threadIdx.y; // Row number
    const int j = blockIdx.x * blockDim.x + threadIdx.x; // Column number

    if (i < rows && j < cols)
    {
        /* Compute linear index from 2D indices.
           The product is widened to size_t so that i * step cannot wrap a
           32-bit int when the matrix is very large. */
        const size_t tidIn  = static_cast<size_t>(i) * iStep + j;
        const size_t tidOut = static_cast<size_t>(i) * oStep + j;

        /* Read input value */
        const float2 input = src[tidIn];

        /* Calculate absolute value */
        const float output = sqrtf(input.x * input.x + input.y * input.y);

        /* Write output value */
        dst[tidOut] = output;
    }
}

/**
 * Example driver: builds a 10x10 complex (CV_32FC2) matrix filled with (1,1),
 * uploads it to the GPU, runs kernel_absolute on it and prints the resulting
 * real-valued (CV_32FC1) matrix.
 *
 * @return 0 on success, 1 if the kernel launch or its execution reported a CUDA error.
 */
int main(int argc, char** argv)
{
    /* Example to compute absolute value of each element of a complex matrix */
    int rows = 10;
    int cols = 10;
    int input_data_type = CV_32FC2; //input is complex
    int output_data_type = CV_32FC1; //output is real

    /* Create input matrix on host */
    cv::Mat input = cv::Mat::zeros(rows,cols,input_data_type) + cv::Vec2f(1,1) /* Initial value is (1,1) */;

    /* Display input */
    cout<<input<<endl;

    /* Create input matrix on device */
    cv::cuda::GpuMat input_d;
    /* Copy from host to device */
    input_d.upload(input);

    /* Create output matrix on device */
    cv::cuda::GpuMat output_d(rows,cols, output_data_type);

    /* Compute element step value of input and output.
       GpuMat::step is in bytes; the kernel indexes in elements. */
    int iStep = static_cast<int>(input_d.step / sizeof(float2));
    int oStep = static_cast<int>(output_d.step / sizeof(float));

    /* Choose appropriate block size */
    dim3 block(8,8);

    /* Compute grid size using input size and block size (ceil-div so the tail is covered) */
    dim3 grid ( (cols + block.x -1)/block.x, (rows + block.y -1)/block.y );

    /* Launch CUDA kernel to compute absolute value */
    kernel_absolute<<<grid, block>>>( reinterpret_cast<float2*>(input_d.data), reinterpret_cast<float*>(output_d.data), rows, cols, iStep, oStep );

    /* Check kernel launch and execution errors.
       The calls are made outside of assert() so that they are not compiled
       out when NDEBUG is defined: cudaGetLastError() reports an invalid
       launch configuration, cudaDeviceSynchronize() reports faults raised
       while the kernel was running. */
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        std::cerr<<"kernel_absolute failed: "<<cudaGetErrorString(status)<<endl;
        return 1;
    }

    cv::Mat output;

    /* Copy results from device to host */
    output_d.download(output);

    /* Display output */
    cout<<endl<<output<<endl;

    return 0;
}

Compiled and tested with the following command on Ubuntu 14.04 with CUDA 8.0:

nvcc -o complex complex.cu -arch=sm_61 -L/usr/local/lib -lopencv_core

sgarizvi
  • 16,623
  • 9
  • 64
  • 98
1

If you want to work with a single input to your kernel, you could flatten your 2-channel image into a 1-channel image.

// Host-side test image: 50 rows x 50 columns, two float channels, all zeros.
Mat h_mat(50, 50, CV_32FC2, Scalar(0.0));

// Mat::reshape(cn, rows): reinterpret the same data as 1 channel in 1 row.
const int flat_channels = 1;
const int flat_rows = 1;
Mat h_mat_flat = h_mat.reshape(flat_channels, flat_rows);

// Allocate a device matrix with the flattened size/type, then copy the host data into it.
GpuMat d_mat_flat(h_mat_flat.rows, h_mat_flat.cols, h_mat_flat.type());
d_mat_flat.upload(h_mat_flat);

Now you can pass d_mat_flat to your kernel as a PtrStepSzf.

zindarod
  • 6,328
  • 3
  • 30
  • 58