nppi resize function with 3 channels getting strange output

Question

I'm getting a strange error when using nppi geometry transform functions from nppi cuda libraries. The code is here:

#include <nppi.h>
#include <nppi_geometry_transforms.h>

#include <iostream>
#include <opencv2/core.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgcodecs.hpp>
#include <vector>

// Saves a 3-channel 8-bit image as a PNG at `path`.
//
// Each source pixel is expanded to four channels: channels 0 and 2 are
// swapped, channel 1 is kept, and a fully opaque alpha (UCHAR_MAX) is
// appended. The PNG is written with compression level 9.
// NOTE(review): the pixels are read as cv::Vec3b, so mat1 is assumed to be
// CV_8UC3 -- confirm at call sites.
void write(const cv::Mat &mat1, const std::string &path) {
    cv::Mat out(mat1.rows, mat1.cols, CV_8UC4);
    for (int row = 0; row < mat1.rows; ++row) {
        for (int col = 0; col < mat1.cols; ++col) {
            const cv::Vec3b &src = mat1.at<cv::Vec3b>(row, col);
            // Reverse the colour channel order and add opaque alpha.
            out.at<cv::Vec4b>(row, col) =
                cv::Vec4b(src[2], src[1], src[0], UCHAR_MAX);
        }
    }
    const std::vector<int> pngParams = {cv::IMWRITE_PNG_COMPRESSION, 9};
    cv::imwrite(path, out, pngParams);
}

// Reproduction case from the question: builds a 256x256 3-channel gradient,
// saves it, uploads it to the GPU, runs nppiResize_8u_C3R with identical
// source and destination sizes, downloads the result and saves it again.
//
// NOTE(review): this listing is intentionally left as posted. It contains the
// bug discussed in the answer: mat.total() is the number of array elements
// (pixels), not the number of bytes, so the allocations and copies below cover
// only one third of a 3-bytes-per-pixel image.
int main() {
    std::cout << "Hello, World!" << std::endl;
    auto mat = cv::Mat(256, 256, CV_8UC3);
    // Fill the image with a gradient: channel 0 follows the column, channel 1
    // follows the row, channel 2 is the inverted column.
    for (int i = 0; i < mat.rows; i++) {
        for (int j = 0; j < mat.cols; j++) {
            auto &rgb = mat.at<cv::Vec3b>(i, j);
            rgb[0] = (uint8_t)j;
            rgb[1] = (uint8_t)i;
            rgb[2] = (uint8_t)(UCHAR_MAX - j);
        }
    }
    write(mat, "./test.png");
    uint8_t *gpuBuffer1;
    uint8_t *gpuBuffer2;
    // BUG: mat.total() is 256*256 here, but the image occupies 256*256*3
    // bytes, so both device buffers are too small.
    cudaMalloc(&gpuBuffer1, mat.total());
    cudaMalloc(&gpuBuffer2, mat.total());
    // BUG: only the first third of the image bytes is uploaded.
    cudaMemcpy(gpuBuffer1, mat.data, mat.total(), cudaMemcpyHostToDevice);
    // The step arguments are row sizes in bytes (cols * 3 channels); the
    // sizes and ROIs are given in pixels and cover the whole image.
    auto status = nppiResize_8u_C3R(
        gpuBuffer1, mat.cols * 3, {.width = mat.cols, .height = mat.rows},
        {.x = 0, .y = 0, .width = mat.cols, .height = mat.rows}, gpuBuffer2,
        mat.cols * 3, {.width = mat.cols, .height = mat.rows},
        {.x = 0, .y = 0, .width = mat.cols, .height = mat.rows},
        NPPI_INTER_NN);
    if (status != NPP_SUCCESS) {
        std::cerr << "Error executing Resize -- code: " << status << std::endl;
    }
    auto mat2 = cv::Mat(mat.rows, mat.cols, CV_8UC3);
    // BUG: only the first third of the result is copied back; the rest of
    // mat2 keeps whatever the freshly allocated Mat contained.
    cudaMemcpy(mat2.data, gpuBuffer2, mat.total(), cudaMemcpyDeviceToHost);
    write(mat2, "./test1.png");
}

Basically, I display a rainbow picture, write it to the GPU, resize it to the EXACT same size, copy it back to the host, and then display it again. What I'm getting is garbled data in about two-thirds of the returned picture.

The first picture is the input picture. The second picture is the output picture.

I expect both pictures to be the same.

If I adjust the ROI with offsets and change the width and height for the destination buffer, the pixels in the top 1/3 of the resized picture actually move and resize correctly. But the rest of the picture is garbled. Not sure what's wrong. Does anyone with experience in cuda nppi libraries or image processing in general have an idea what's going on?

CMake file included below for convenience to anyone who wants to compile it. You have to have opencv and cuda toolkit installed as C++ libs:

# Build script for the test_nppi reproduction program (single source: main.cu).
cmake_minimum_required(VERSION 3.18)
project(test_nppi)
# Enable the CUDA language in addition to the languages enabled by project().
enable_language(CUDA)
# NOTE(review): this sets the C++ standard for C++ sources; CUDA sources are
# governed by CMAKE_CUDA_STANDARD -- confirm whether that should be set too.
set(CMAKE_CXX_STANDARD 17)

# The CUDA toolkit is mandatory (provides the CUDA::nppig target used below).
find_package(CUDAToolkit REQUIRED)
# NOTE(review): OpenCV is not marked REQUIRED, so a missing OpenCV install is
# not reported at configure time -- confirm whether that is intended.
find_package(OpenCV)

# Print the CUDA toolkit include directories during configuration.
message(STATUS ${CUDAToolkit_INCLUDE_DIRS})
add_executable(test_nppi main.cu)
# Link against OpenCV and the NPP library exposed as CUDA::nppig.
target_link_libraries(test_nppi ${OpenCV_LIBS} CUDA::nppig)
target_include_directories(test_nppi PUBLIC ${OpenCV_INCLUDE_DIRS} ${CUDAToolkit_INCLUDE_DIRS})

# Turn on separable compilation for the CUDA sources of this target.
set_target_properties(test_nppi PROPERTIES
        CUDA_SEPARABLE_COMPILATION ON)

I've used the nppi resize function for single channel pictures before and I don't have this issue. The 3 channel nppi resize function is getting weird output and I'm thinking I'm not completely understanding the input parameters. The Step is multiplied by 3 because of 3 color channels, but all other sizes just are measuring the dimensions by pixels; and the sizes of src and destination are the same... not sure what I'm not understanding here.

I would guess your assumption about the memory layout of the openCV mat is not correct. I recall they are usually stored in pitched memory, so check the pitch. You are probably telling the NPP call the wrong memory layout — talonmies, Jun 12 '22 at 02:38

Rotem · Accepted Answer · 2022-06-12T11:31:04.257

The issue is that mat.total() equals the total number of pixels, and not the total number of bytes.

According to OpenCV documentation:

total () const
Returns the total number of array elements.

In your code sample, mat.total() equals 256*256, while the total number of bytes equals 256*256*3 (RGB uses 3 bytes per pixel).
(In OpenCV terminology "array element" is equivalent to image pixel).

cudaMemcpy(gpuBuffer1, mat.data, mat.total()... copies only 1/3 of the total image bytes, so only the upper 1/3 of the image data is valid.

According to this post, the correct way for computing the number of bytes is:

size_t mat_size_in_bytes = mat.step[0] * mat.rows;

In most cases for CV_8UC3, mat.step[0] = mat.cols*3, but to cover all cases, it is better to use mat.step[0].

Corrected code sample:

#include "nppi.h"
#include "nppi_geometry_transforms.h"

#include <iostream>
#include "opencv2/core.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgcodecs.hpp"
#include <vector>

// Saves a 3-channel 8-bit image as a PNG at `path`.
//
// Each source pixel is expanded to four channels: channels 0 and 2 are
// swapped, channel 1 is kept, and a fully opaque alpha (UCHAR_MAX) is
// appended. The PNG is written with compression level 9.
//
// A failed save is reported on std::cerr instead of being silently ignored;
// the function still returns normally so existing callers are unaffected.
// NOTE(review): the pixels are read as cv::Vec3b, so mat1 is assumed to be
// CV_8UC3 -- confirm at call sites.
void write(const cv::Mat& mat1, const std::string& path) {
    auto mat2 = cv::Mat(mat1.rows, mat1.cols, CV_8UC4);
    for (int i = 0; i < mat1.rows; i++) {
        for (int j = 0; j < mat1.cols; j++) {
            auto& bgra = mat2.at<cv::Vec4b>(i, j);
            auto& rgb = mat1.at<cv::Vec3b>(i, j);
            // Reverse the colour channel order and add opaque alpha.
            bgra[0] = rgb[2];
            bgra[1] = rgb[1];
            bgra[2] = rgb[0];
            bgra[3] = UCHAR_MAX;
        }
    }
    std::vector<int> compression_params;
    compression_params.push_back(cv::IMWRITE_PNG_COMPRESSION);
    compression_params.push_back(9);
    // cv::imwrite returns false when the image could not be saved.
    if (!cv::imwrite(path, mat2, compression_params)) {
        std::cerr << "Failed to write image: " << path << std::endl;
    }
}

int main() {
    std::cout << "Hello, World!" << std::endl;
    auto mat = cv::Mat(256, 256, CV_8UC3);
    auto mat2 = cv::Mat(mat.rows, mat.cols, CV_8UC3);
    for (int i = 0; i < mat.rows; i++) {
        for (int j = 0; j < mat.cols; j++) {
            auto& rgb = mat.at<cv::Vec3b>(i, j);
            rgb[0] = (uint8_t)j;
            rgb[1] = (uint8_t)i;
            rgb[2] = (uint8_t)(UCHAR_MAX - j);
        }
    }
    write(mat, "./test.png");
    uint8_t* gpuBuffer1;
    uint8_t* gpuBuffer2;
    size_t mat_size_in_bytes = mat.step[0] * mat.rows;  // https://stackoverflow.com/questions/26441072/finding-the-size-in-bytes-of-cvmat
    size_t mat2_size_in_bytes = mat2.step[0] * mat2.rows;
    cudaMalloc(&gpuBuffer1, mat_size_in_bytes);
    cudaMalloc(&gpuBuffer2, mat2_size_in_bytes);
    cudaMemcpy(gpuBuffer1, mat.data, mat_size_in_bytes, cudaMemcpyHostToDevice);

    NppiSize oSrcSize = { mat.cols, mat.rows };
    NppiRect oSrcRectROI = { 0, 0, mat.cols, mat.rows };
    NppiSize oDstSize = { mat2.cols, mat2.rows };
    NppiRect oDstRectROI = { 0, 0, mat2.cols, mat2.rows };

    auto status = nppiResize_8u_C3R(
        gpuBuffer1, mat.step[0], oSrcSize,
        oSrcRectROI, gpuBuffer2,
        mat2.step[0], oDstSize,
        oDstRectROI,
        NPPI_INTER_NN);

    if (status != NPP_SUCCESS) {
        std::cerr << "Error executing Resize -- code: " << status << std::endl;
    }
    
    cudaMemcpy(mat2.data, gpuBuffer2, mat2_size_in_bytes, cudaMemcpyDeviceToHost);
    write(mat2, "./test1.png");
}

Output:

nppi resize function with 3 channels getting strange output

1 Answer