
I'm working with OpenCV 3.1 cv::cuda template matching, but the cv::cuda::minMaxLoc() function is too slow for my case. My match results have a minimum size of 128x128 and a maximum size of up to 512x512. On average, minMaxLoc() takes 1.65 ms for a 128x128 result and up to 25 ms for something like 350x350, which is too long since this is done hundreds of times.

I understand that my match sizes are probably too small for what you would usually run on a GPU. But I want to try something along the lines of what Robert Crovella did in thrust::max_element slow in comparison cublasIsamax - More efficient implementation? to see if I can get better performance.

My problem is that in all those reductions the data is read using linear indexes, and cv::cuda::PtrStepSzf does not allow this (at least I did not find how). I tried to reshape my match result, but I cannot do that because the data is not contiguous. Do I need to go toward cudaMallocPitch and cudaMemcpy2D? If that is the case, how do I do that with a cv::cuda::GpuMat read as a cv::cuda::PtrStepSzf object?
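If I have to go that route, I imagine it means an extra device-to-device copy into a contiguous buffer for every match, roughly like this (d_contig is just an illustrative name, d_match is the match result from the code below), which is exactly the overhead I would like to avoid:

    float* d_contig = nullptr;
    size_t rowBytes = d_match.cols * sizeof(float);
    gpuErrchk( cudaMalloc((void**)&d_contig, rowBytes * d_match.rows) );
    // cudaMemcpy2D honours the source pitch (GpuMat::step, in bytes) and packs
    // the rows tightly in the destination, so d_contig can then be indexed
    // linearly from 0 to rows*cols-1
    gpuErrchk( cudaMemcpy2D(d_contig, rowBytes,                   // dst, dst pitch
                            d_match.ptr<float>(), d_match.step,   // src, src pitch
                            rowBytes, d_match.rows,               // width in bytes, height
                            cudaMemcpyDeviceToDevice) );

Here is what I have so far, working directly on the GpuMat: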

    #define nTPB 1024
    #define FLOAT_MAX 9999.0f

    __global__ void minLoc(const cv::cuda::PtrStepSzf data,
                           float* minVal,
                           int*   minValLoc)
    {
        int dsize = data.cols * data.rows;
        __shared__ volatile float vals[nTPB];
        __shared__ volatile int   idxs[nTPB];
        __shared__ volatile int   last_block;
        int idx = threadIdx.x + blockDim.x * blockIdx.x;
        last_block = 0;
        float my_val = FLOAT_MAX;
        int   my_idx = -1;
        // grid-stride sweep over global memory
        while (idx < dsize)
        {
            // data(idx) is an illegal call; the legal one is data(y, x)
            // How do I do it?
            if (data(idx) < my_val)
            {
                my_val = data(idx);
                my_idx = idx;
            }
            idx += blockDim.x * gridDim.x;
        }

        // ... rest of the kernel
    }

    void callMinLocKernel(cv::InputArray _input,
                          cv::Point& minValLoc,
                          float& minVal,
                          cv::cuda::Stream _stream = cv::cuda::Stream::Null())
    {
        const cv::cuda::GpuMat input = _input.getGpuMat();
        dim3 cthreads(32, 32);
        dim3 cblocks(
            static_cast<int>(std::ceil(input.size().width /
                static_cast<double>(cthreads.x))),
            static_cast<int>(std::ceil(input.size().height /
                static_cast<double>(cthreads.y))));

        // code that creates and uploads d_min, d_minLoc
        float h_min    = 9999;
        int h_minLoc   = -1;
        float* d_min   = 0;
        int* d_minLoc  = 0;
        // gpuErrchk is defined elsewhere
        gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
        gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
        gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
        gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

        cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
        minLoc<<<cblocks, cthreads, 0, stream>>>(input, d_min, d_minLoc);
        gpuErrchk(cudaGetLastError());
        // code to read back the answer
        gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
        gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
        gpuErrchk( cudaFree(d_min) );
        gpuErrchk( cudaFree(d_minLoc) );

        minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
        minVal = h_min;
    }

    int main()
    {
        // read background and template
        cv::Mat input = cv::imread("cat.jpg", 0);
        cv::Mat templ = cv::imread("catNose.jpg", 0);

        // convert to floats
        cv::Mat float_input, float_templ;
        input.convertTo(float_input, CV_32FC1);
        templ.convertTo(float_templ, CV_32FC1);

        // upload background and template to the GPU
        cv::cuda::GpuMat d_src, d_templ, d_match;
        cv::Size size = float_input.size();
        d_src.upload(float_input);
        d_templ.upload(float_templ);

        double min_val, max_val;
        cv::Point min_loc, max_loc;

        cv::Ptr<cv::cuda::TemplateMatching> alg =
            cv::cuda::createTemplateMatching(d_src.type(), CV_TM_SQDIFF);
        alg->match(d_src, d_templ, d_match);
        cv::cuda::normalize(d_match, d_match, 0, 1, cv::NORM_MINMAX, CV_32F);
        // Too slow:
        //cv::cuda::minMaxLoc(d_match, &min_val, &max_val, &min_loc, &max_loc);

        // the custom wrapper returns the minimum as a float
        float min_val_f = 0.0f;
        callMinLocKernel(d_match, min_loc, min_val_f);
        min_val = min_val_f;

        return 0;
    }

1 Answer

I did not find a way to actually use linear indexes with cv::cuda::PtrStepSzf, and I am not sure there is one. It looks like with this format you can only use two subscripts (row, column). Instead, I used the raw pointer (ptr<float>()) of the cv::cuda::GpuMat input variable in the kernel wrapper, as follows:

#define nTPB 1024
#define FLOAT_MAX 9999.0f

void callMinLocKernel(cv::InputArray _input,
                      cv::Point& minValLoc,
                      float& minVal,
                      cv::cuda::Stream _stream = cv::cuda::Stream::Null())
{
    const cv::cuda::GpuMat input = _input.getGpuMat();
    const float* linSrc = input.ptr<float>();
    size_t step         = input.step;
    dim3 cthreads(nTPB);
    dim3 cblocks(
        static_cast<int>(std::ceil(input.size().width * input.size().height /
            static_cast<double>(nTPB))));

    // code that creates and uploads d_min, d_minLoc
    float h_min    = 9999;
    int h_minLoc   = -1;
    float* d_min   = 0;
    int* d_minLoc  = 0;
    // gpuErrchk is defined elsewhere
    gpuErrchk( cudaMalloc((void**)&d_min, sizeof(h_min)) );
    gpuErrchk( cudaMalloc((void**)&d_minLoc, sizeof(h_minLoc)) );
    gpuErrchk( cudaMemcpy(d_min, &h_min, sizeof(h_min), cudaMemcpyHostToDevice) );
    gpuErrchk( cudaMemcpy(d_minLoc, &h_minLoc, sizeof(h_minLoc), cudaMemcpyHostToDevice) );

    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(_stream);
    minLoc<<<cblocks, cthreads, 0, stream>>>(linSrc, step, input.size(), d_min, d_minLoc);
    gpuErrchk(cudaGetLastError());
    // code to read back the answer
    gpuErrchk( cudaMemcpy(&h_min, d_min, sizeof(h_min), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaMemcpy(&h_minLoc, d_minLoc, sizeof(h_minLoc), cudaMemcpyDeviceToHost) );
    gpuErrchk( cudaFree(d_min) );
    gpuErrchk( cudaFree(d_minLoc) );

    minValLoc = cv::Point(h_minLoc % input.cols, h_minLoc / input.cols);
    minVal = h_min;
}

And inside the kernel:

__global__ void minLoc(const float* data,
                       const size_t step,
                       cv::Size dataSz,
                       float* minVal,
                       int*   minValLoc)
{
    __shared__ volatile float vals[nTPB];
    __shared__ volatile int   idxs[nTPB];
    __shared__ volatile int   last_block;

    int idx         = threadIdx.x + blockDim.x * blockIdx.x;
    const int dsize = dataSz.height * dataSz.width;
    last_block = 0;
    float my_val = FLOAT_MAX;
    int   my_idx = -1;
    // grid-stride sweep over global memory
    while (idx < dsize)
    {
        // step is in bytes, so convert the logical (row, col) position
        // into an offset measured in float elements
        int row = idx / dataSz.width;
        int id  = (row * step / sizeof(float)) + idx % dataSz.width;
        if (data[id] < my_val)
        {
            my_val = data[id];
            my_idx = idx;
        }
        idx += blockDim.x * gridDim.x;
    }

    // ... rest of the kernel
}

The step is in bytes, so it needs to be divided by sizeof(float) (or whatever the element type is) when converting it to an element offset. I hope this helps!
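The "... rest of the kernel" part can be filled in following the threadfence reduction pattern from the linked Robert Crovella answer. Below is a rough, untested sketch of that stage; blockVals and blockIdxs (per-block scratch arrays that would have to be added to the kernel signature and allocated with one entry per block in the wrapper) and the blocksDone counter are my own names and are not part of the code above:

// Assumed extra pieces:
//   float* blockVals, int* blockIdxs  -> passed to the kernel, cblocks.x entries each
//   a global counter, zero-initialised before the first launch:
__device__ unsigned int blocksDone = 0;

// ... continuing inside minLoc(), after the grid-stride sweep:

    // 1. per-block shared-memory reduction of each thread's candidate
    vals[threadIdx.x] = my_val;
    idxs[threadIdx.x] = my_idx;
    __syncthreads();
    for (int i = blockDim.x / 2; i > 0; i >>= 1)
    {
        if (threadIdx.x < i && vals[threadIdx.x + i] < vals[threadIdx.x])
        {
            vals[threadIdx.x] = vals[threadIdx.x + i];
            idxs[threadIdx.x] = idxs[threadIdx.x + i];
        }
        __syncthreads();
    }

    // 2. thread 0 publishes this block's partial result and checks whether
    //    this block is the last one to finish
    if (threadIdx.x == 0)
    {
        blockVals[blockIdx.x] = vals[0];
        blockIdxs[blockIdx.x] = idxs[0];
        __threadfence();
        unsigned int ticket = atomicInc(&blocksDone, gridDim.x);
        last_block = (ticket == gridDim.x - 1);
    }
    __syncthreads();

    // 3. the last block reduces the per-block partials and writes the result
    if (last_block)
    {
        my_val = FLOAT_MAX;
        my_idx = -1;
        for (int i = threadIdx.x; i < gridDim.x; i += blockDim.x)
            if (blockVals[i] < my_val)
            {
                my_val = blockVals[i];
                my_idx = blockIdxs[i];
            }
        vals[threadIdx.x] = my_val;
        idxs[threadIdx.x] = my_idx;
        __syncthreads();
        for (int i = blockDim.x / 2; i > 0; i >>= 1)
        {
            if (threadIdx.x < i && vals[threadIdx.x + i] < vals[threadIdx.x])
            {
                vals[threadIdx.x] = vals[threadIdx.x + i];
                idxs[threadIdx.x] = idxs[threadIdx.x + i];
            }
            __syncthreads();
        }
        if (threadIdx.x == 0)
        {
            *minVal    = vals[0];
            *minValLoc = idxs[0];
            blocksDone = 0;   // reset so the next launch starts clean
        }
    }

The idea is that each block first reduces its own candidates in shared memory, publishes a single partial result to global memory, and the last block to finish reduces those partials and writes the final minimum value and its linear index.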
