How to implement nearest neighbours image resizing algorithm in CUDA?

Question

My main purpose is to load frames from a video with OpenCV, then copy it Nvidia Gpu memory, resize it with a Cuda based nearest neighbour algorithm, then copy it back to the host side and visualise it with cv::imshow()

Unfortunately, I always got segmentation faults. There could be a problem with defining the amount of bytes to be copied or with the data conversions. Below, you can find the main parts of the source code, but here is the repo for the full project: https://github.com/foxakarmi/imageResize

Main function:

#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>

void *buffers[3];

int main() {

    cv::VideoCapture capture;
    cv::Mat frame;

    capture.open("/p.mp4");

    if (!capture.isOpened()) {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
    CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
    buffers[2] = malloc(3 * 640 * 640);

    while (capture.read(frame)) {
        CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))

        cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);

        CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))

        cv::Mat foo;
        foo.data = static_cast<uchar *>(buffers[2]);
        cv::imshow("img", foo);
        cv::waitKey(1);
    }

    capture.release();
    return 0;
}

The .cu file containing the kernel and a wrapper function:

#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"

__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    int channel = 3;

    if (i < dst_h && j < dst_w) {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_h;

        dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
        dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
        dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
    }
}

cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;

    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    kernelNearestNeighbourResize <<< 3600, 256>>>(
            src_img, dst_img, src_w,
            src_h, dst_w, dst_h);

    return cudaGetLastError();
}

The cuda grid size is only 1x1 (`<<< 1, 1 >>>`), so only 1 pixel is being handled. Also - are you sure you get a cv::Mat with `float` values for each channel from `capture.read` ? — wohlstad, May 19 '22 at 17:00
Also when you copy the data back to the host, I think you need to use `destImgHost.data` instead of `destImgHost`. — wohlstad, May 19 '22 at 17:06
Yeah, the grid size have to be improved to handle all of the pixels. Yes, I got float values as in "frame.create(frame.rows, frame.cols, CV_32FC3); " CV_32FC3 refers to 32 bit float values. — Fox1942, May 20 '22 at 08:57
about the pixel format: I am aware that `frame.create(frame.rows, frame.cols, CV_32FC3); ` creates a `float` image. But every time you call `capture.read(frame);` it will reinitialize `frame`. In a short test I did it seems like when you decode a mp4 file, the decoder will return a 3 channel `uint8` frames, not `float`. I suggest you verify it in your case. — wohlstad, May 21 '22 at 04:01
Yeah, I rewroted the whole code using uchar *, instead of float, but unfortunately, it got worse. See the updates — Fox1942, May 25 '22 at 16:40

wohlstad · Accepted Answer · 2022-10-12T16:34:19.827

Below you can see a complete working solution.

There are 3 main issues in your code:

The setup for the CUDA grid is incorrect. See an example how to set it in my code below (just an initial working version that you can further improve). See some general info here: The CUDA Programming Model.
Note: the grid setup can have a meaningful effect on the overall performance, and it is not trivial to optimize. See more info here: How do I choose grid and block dimensions for CUDA kernels?.
When copying the data to the device, you used frame.ptr() instead of frame.data.
You only set the data pointer for the output cv::Mat foo, without properly initializing it. So the cv::Mat metadata (rows, cols etc.) were not set and cv::imshow could not show it properly. In my code it is not required - see below.

Note that your code skips the first frame. I kept this behavior. You could include the first frame by checking if dst_img was already initialized, and if not (since it's the first frame) - initialize it and the CUDA buffers.

Some more notes on the code below:

There's no need to allocate buffer[2] for the host output image. Instead I initialized the cv::Mat with the proper size and use it's allocated buffer.
I renamed the device buffers, and added cudaFree for them.
It is safer to pass the number of channels to the kernel, rather than making it assume it is 3.
I passed the step (AKA stride) of the images to the kernel. This will support the case where the images have padding (see about it here: stride and padding of an image).

Code for main:

#include <iostream>
#include <opencv2/highgui/highgui.hpp>
#include "cuda_runtime.h"
#include <assert.h>

#define CUDA_CHECK(x) { cudaError_t cudaStatus = x; assert(cudaStatus == cudaSuccess); }

cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channel,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step);

int main()
{
    cv::VideoCapture capture;
    cv::Mat frame;
    capture.open("/p.mp4");
    if (!capture.isOpened()) 
    {
        std::cout << "can not open" << std::endl;
        return -1;
    }
    capture.read(frame);

    int src_w = frame.cols;
    int src_h = frame.rows;
    int src_step = (int)frame.step[0];
    int channels = frame.channels();
    int data_type = frame.type();
    assert((data_type & CV_MAT_DEPTH_MASK) == CV_8U);   // assert that it is a uchar image

    // Parameters you can change:
    int dst_w = 640;
    int dst_h = 640;

    cv::Mat dst_img(dst_h, dst_w, data_type);
    int dst_step = (int)dst_img.step[0];

    void * src_dev_buffer;
    void * dst_dev_buffer;
    CUDA_CHECK(cudaMalloc(&src_dev_buffer, src_h * src_step));
    CUDA_CHECK(cudaMalloc(&dst_dev_buffer, dst_h * dst_step));

    while (capture.read(frame))
    {
        // assert that the current frame has the same type and dimensions as the first one (should be guaranteed by the video decoder):
        assert(frame.cols == src_w);
        assert(frame.rows == src_h);
        assert((int)frame.step[0] == src_step);
        assert(frame.type() == data_type);

        CUDA_CHECK(cudaMemcpy(src_dev_buffer, frame.data, src_h * src_step, cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaNearestResize((unsigned char *)src_dev_buffer, (unsigned char *)dst_dev_buffer, channels, src_w, src_h, src_step, dst_w, dst_h, dst_step));
        CUDA_CHECK(cudaMemcpy(dst_img.data, dst_dev_buffer, dst_h * dst_step, cudaMemcpyDeviceToHost));
        cv::imshow("dst_img", dst_img);
        cv::waitKey(1);
    }

    CUDA_CHECK(cudaFree(src_dev_buffer));
    CUDA_CHECK(cudaFree(dst_dev_buffer));

    capture.release();
    return 0;
}

Code for the CUDA kernel and the wrapping function:

#include "cuda_runtime.h"

__global__ void kernelNearestNeighbourResize(unsigned char *src_img, unsigned char *dst_img, int channels,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    int i = blockDim.y * blockIdx.y + threadIdx.y;
    int j = blockDim.x * blockIdx.x + threadIdx.x;

    if (i < dst_h && j < dst_w) 
    {
        int iIn = i * src_h / dst_h;
        int jIn = j * src_w / dst_w;

        int src_offset = i * dst_step + j * channels;
        int dst_offset = iIn * src_step + jIn * channels;
        for (int c = 0; c < channels; ++c) 
        {
            dst_img[src_offset + c] = src_img[dst_offset + c];
        }
    }
}

cudaError_t cudaNearestResize(unsigned char *src_img, unsigned char *dst_img, int channels,
    int src_w, int src_h, int src_step, int dst_w, int dst_h, int dst_step)
{
    if (!src_img || !dst_img)
        return cudaErrorInvalidDevicePointer;

    if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
        return cudaErrorInvalidValue;

    // The grid dimensions
    dim3 dimBlock(32, 32);
    dim3 dimGrid(dst_w / 32 + 1, dst_h / 32 + 1);

    kernelNearestNeighbourResize << < dimGrid, dimBlock >> >(
        src_img, dst_img, channels,
        src_w, src_h, src_step, dst_w, dst_h, dst_step);

    return cudaGetLastError();
}

How to implement nearest neighbours image resizing algorithm in CUDA?

1 Answers1