My main purpose is to load frames from a video with OpenCV, then copy it Nvidia Gpu memory, resize it with a Cuda based nearest neighbour algorithm, then copy it back to the host side and visualise it with cv::imshow()
Unfortunately, I always got segmentation faults. There could be a problem with defining the amount of bytes to be copied or with the data conversions. Below, you can find the main parts of the source code, but here is the repo for the full project: https://github.com/foxakarmi/imageResize
Main function:
#include <iostream>
#include "cuda_utils.h"
#include "yololayer.h"
#include <opencv2/highgui/highgui.hpp>
void *buffers[3];
int main() {
cv::VideoCapture capture;
cv::Mat frame;
capture.open("/p.mp4");
if (!capture.isOpened()) {
std::cout << "can not open" << std::endl;
return -1;
}
capture.read(frame);
CUDA_CHECK(cudaMalloc(&buffers[0], frame.cols * frame.step[0]));
CUDA_CHECK(cudaMalloc(&buffers[1], 3 * 640 * 640));
buffers[2] = malloc(3 * 640 * 640);
while (capture.read(frame)) {
CUDA_CHECK(cudaMemcpy(buffers[0], frame.ptr(), frame.step[0] * frame.rows, cudaMemcpyHostToDevice))
cudaNearestResize((uchar *) buffers[0], (uchar *) buffers[1], frame.cols, frame.rows, 640, 640);
CUDA_CHECK(cudaMemcpy(buffers[2], buffers[1], 640 * 640 * 3, cudaMemcpyDeviceToHost))
cv::Mat foo;
foo.data = static_cast<uchar *>(buffers[2]);
cv::imshow("img", foo);
cv::waitKey(1);
}
capture.release();
return 0;
}
The .cu file containing the kernel and a wrapper function:
#include <opencv2/core/hal/interface.h>
#include "yololayer.h"
#include "cuda_utils.h"
__global__ void kernelNearestNeighbourResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int channel = 3;
if (i < dst_h && j < dst_w) {
int iIn = i * src_h / dst_h;
int jIn = j * src_w / dst_h;
dst_img[(i * dst_w + j) * channel + 0] = src_img[(iIn * src_w + jIn) * channel + 0];
dst_img[(i * dst_w + j) * channel + 1] = src_img[(iIn * src_w + jIn) * channel + 1];
dst_img[(i * dst_w + j) * channel + 2] = src_img[(iIn * src_w + jIn) * channel + 2];
}
}
cudaError_t cudaNearestResize(uchar *src_img, uchar *dst_img, int src_w, int src_h, int dst_w, int dst_h) {
if (!src_img || !dst_img)
return cudaErrorInvalidDevicePointer;
if (src_w == 0 || src_h == 0 || dst_w == 0 || dst_h == 0)
return cudaErrorInvalidValue;
kernelNearestNeighbourResize <<< 3600, 256>>>(
src_img, dst_img, src_w,
src_h, dst_w, dst_h);
return cudaGetLastError();
}