-1

I need to convert image from bgr to yuv420p and I first use OpenCV to do so.

Mat img = imread("1.bmp");
Mat yuvImg;
cvtColor(img,yuvImg,COLOR_BGR2YUV_I420);

The result of it is normal. However,my image is too big and its pixel is almost 6400 * 2000. I find it costs too much time of converting bgr to yuv420p with opencv api cvtcolor.

Then I decide to convert it myself and speed it with cuda.

Here is code in cpu:

void bgr_to_yuv420p(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr== NULL)
        return;
    int frameSize = width*height;
    int chromaSize = frameSize / 4;

    int yIndex = 0;
    int uIndex = frameSize;
    int vIndex = frameSize + chromaSize;

    int R, G, B, Y, U, V;
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            B = bgr[(i * width + j) * 3 + 0];
            G = bgr[(i * width + j) * 3 + 1];
            R = bgr[(i * width + j) * 3 + 2];

            //BGR to YUV
            Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
            U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
            V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;

            yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
            if (i % 2 == 0 && j % 2 == 0)
            {
                yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
                yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
            }
        }
    }
}

I test the code bgr_to_yuv420p(...) and the result is also normal.

Then I speed it up with cuda.

Here is all my code include kernel function and test function.

#include <iostream>
#include <time.h>
#include <vector_types.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include "opencv2/highgui.hpp" 
#include "opencv2/opencv.hpp"
using namespace cv;
using namespace std;

//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 *  d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{

    int col_num = blockIdx.x*blockDim.x+threadIdx.x;
    int row_num = blockIdx.y*blockDim.y+threadIdx.y;

    if ((row_num < imgheight) && (col_num < imgwidth))
    {
//        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
        int global_offset = row_num*imgwidth+col_num;

        int r,g,b;
        r = int(d_in[global_offset].z);
        g = int (d_in[global_offset].y);
        b = int (d_in[global_offset].x);


        d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
        if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
            int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
            d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
            d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

        }

    }
}

int main(void)
{

    Mat srcImage = imread("1.bmp");
    imshow("srcImage", srcImage);
    const uint imgheight = srcImage.rows;
    const uint imgwidth = srcImage.cols;

    Mat nv12Image(imgheight * 3 / 2, imgwidth, CV_8UC1, Scalar(255));

    //input and output 
    uchar3 *d_in;
    unsigned char *d_out;

    // malloc memo in gpu
    cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
    cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);

    //copy image from cpu to gpu
    cudaMemcpy(d_in, srcImage.data, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    //run kernel function
    bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

    cudaDeviceSynchronize();

    //copy yuv420p from gpu to cpu
    cudaMemcpy(nv12Image.data, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);

    imshow("nv12",nv12Image);
    imwrite("cuda.bmp",nv12Image);

    cudaFree(d_in);
    cudaFree(d_out);


    return 0;

}

The code with cuda can run but the result is not normal. Y of YUV420p is normal but there is something wrong with U and V. I think the reason is here in __global__ void bgr2yuv420p(...)

if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
                int uv_offset = imgwidth*imgheight+((row_num*imgwidth))+col_num;
                d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
                d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

            }

I try a lot but still cannot solve it. And I find little code about converting rgb to yuv420p, More codes are about converting yuv420p to rgb. So I want to know is somebody running into the same question or giving me some advice?

Thanks Robert Crovella.Here is my update-1.

I follow Robert Crovella's advice and change the kernel function like this:

//kernel function to convert bgr to yuv420p
    __global__ void bgr2yuv420p(uchar3 *  d_in, unsigned char * d_out,
                                   uint imgheight, uint imgwidth)
    {

        int col_num = blockIdx.x*blockDim.x+threadIdx.x;
        int row_num = blockIdx.y*blockDim.y+threadIdx.y;

        if ((row_num < imgheight) && (col_num < imgwidth))
        {
    //        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
            int global_offset = row_num*imgwidth+col_num;

            int r,g,b;
            r = int(d_in[global_offset].z);
            g = int (d_in[global_offset].y);
            b = int (d_in[global_offset].x);


            d_out[row_num * imgwidth + col_num] = ((66*r + 129*g + 25*b) >> 8) + 16;
            if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
                int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
                d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
                d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

            }

        }
    }

I test the new kernel with excitement,but the result is also not normal. Here is my result image with the updated kernel function. yuv420p image converted by myself

Then the normal result image converted by opencv api is here. yuv420p image converted by opencv api

As we can see, the difference between the two images is U and V. I have already changed the index of U and V in kernel function, i.e.

if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
                int uv_offset = imgwidth*imgheight+((row_num >>1)*imgwidth)+col_num;
                d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;
                d_out[uv_offset+1] = ((-38*r + -74*g + 112*b) >> 8) + 128;

            }

I think it will work but it does not. Any other advice? Robert Crovella

Edit: The solution is Robert Crovella's latest answer. I have double checked it and it is really perfect.

yuyuyu
  • 1
  • 4
  • Your kernel definition is `bgr2yuv420p` but your kernel launch is `rgb2yuv420p` so your code won't compile. I guess this is not actually the code you are running. In your kernel, your calculation of `uv_offset` is generating illegal, out-of-bounds accesses in the subsequent lines. You may wish to study that indexing and use the method described [here](https://stackoverflow.com/questions/27277365/unspecified-launch-failure-on-memcpy/27278218#27278218) to help debug. – Robert Crovella Apr 27 '20 at 14:54
  • Think carefully about your `uv_offset` calculation. The `imgwidth*imgheight` offset gets you past the Y area (correctly), but from that point, is it correct to use `row_num*imgwidth` to index by row into the UV planar region? (hint, it is not. You do not have that many rows in the UV planar region, you only have half as many rows). – Robert Crovella Apr 27 '20 at 15:00
  • I think you will get closer to something that is correct if you make this change: `int uv_offset = imgwidth*imgheight+(((row_num>>1)*imgwidth))+col_num;` – Robert Crovella Apr 27 '20 at 15:09
  • 1
    comparing your `bgr_to_yuv420p` CPU code to your `bgr2yuv420p` GPU kernel we also see that the ordering of U and V storage is reversed, and there are some other calculation differences. If I sort those all out, I can get matching results between them. Note that your `bgr_to_yuv420p` CPU code implies planar storage for Y,U,V whereas your GPU code is delivering semi-planar storage for Y plane and UV interleaved plane. – Robert Crovella Apr 27 '20 at 15:43
  • Sorry for the low level error about kernel function rgb2yuv420p() and bgr2yuv420p(). I just misinput when running kernel function. In fact, I define bgr2yuv420p() and I also run bgr2yuv420p(). I will do double check about it and edit my question. – yuyuyu Apr 28 '20 at 01:49
  • "As we can see, the difference between the two images is U and V." I already pointed out this: *comparing your bgr_to_yuv420p CPU code to your bgr2yuv420p GPU kernel we also see that the ordering of U and V storage is reversed*. Maybe you didn't get that. Hopefully this will make it clear. In your GPU code this is the line that stores U: `d_out[uv_offset] = ((112*r + -94*g + -18*b) >> 8) + 128;` In your CPU code, this is the line that calculates U: `U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;`. See the difference? You have U and V reversed between your CPU and your GPU code. – Robert Crovella Apr 30 '20 at 22:37

1 Answers1

1

There are a variety of issues:

  • the calculations to convert R,G,B to Y,U,V between your CPU and GPU codes are not identical. Yes, this matters.
  • Your CPU code has planar Y,U,V storage. That means Y has its own plane, U has its own plane, and V has its own plane. Your GPU codes is semi planar (NV12) format. That means Y has its own plane, and U,V are interleaved in a single plane: UVUVUVUVUVUV.... Obviously the output of those two codes could never match identically.
  • IMO, there is no need to drag OpenCV into this.
  • Your UV offset calculation in the kernel (GPU) code was broken. The imgwidth*imgheight offset gets you past the Y area (correctly), but from that point, it is not correct to use row_num*imgwidth to index by row into the UV planar region. You do not have that many rows in the UV planar region, you only have half as many rows.
  • In your GPU kernel, you had U,V ordering reversed, you were effectively doing VUVUVUVU...

My recommendation would be to start by harmonizing the calculation differences and storage order/format. The following code has the above issues addressed, and gives matching results for me between CPU and GPU codes:

$ cat t1708.cu
#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
// I have no idea if these are the correct conversion formulas
// I simply lifted what I saw in your host code so that we 
// are using the same conversion calculations in host and device
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
  int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
  return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
  int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
  return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
  int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
  return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}

void bgr_to_yuv420p(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr== NULL)
        return;
    int frameSize = width*height;

    int yIndex = 0;
    int uIndex = frameSize;

    int R, G, B;
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            B = bgr[(i * width + j) * 3 + 0];
            G = bgr[(i * width + j) * 3 + 1];
            R = bgr[(i * width + j) * 3 + 2];

            //BGR to YUV
            yuv420p[yIndex++] = bgr2y(R,G,B);
            if (i % 2 == 0 && j % 2 == 0)
            {
                yuv420p[uIndex] = bgr2u(R,G,B);
                yuv420p[uIndex+1] = bgr2v(R,G,B);
                uIndex+=2;
            }
        }
    }
}

//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 *  d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{

    int col_num = blockIdx.x*blockDim.x+threadIdx.x;
    int row_num = blockIdx.y*blockDim.y+threadIdx.y;

    if ((row_num < imgheight) && (col_num < imgwidth))
    {
//        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
        int global_offset = row_num*imgwidth+col_num;

        int r,g,b;
        r = int(d_in[global_offset].z);
        g = int (d_in[global_offset].y);
        b = int (d_in[global_offset].x);


        d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
        if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
            int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
            d_out[uv_offset] = bgr2u(r,g,b);
            d_out[uv_offset+1] = bgr2v(r,g,b);

        }

    }
}

int main(void)
{

    const uint imgheight = 1000;
    const uint imgwidth = 1500;

    //input and output
    uchar3 *d_in;
    unsigned char *d_out;
    uchar3 *idata = new uchar3[imgheight*imgwidth];
    unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
    unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
    uchar3 pix;
    for (int i = 0; i < imgheight*imgwidth; i++){
      pix.x = (rand()%30)+40;
      pix.y = (rand()%30)+40;
      pix.z = (rand()%30)+40;
      idata[i] = pix;}
    for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
    bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
    // malloc memo in gpu
    cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
    cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);

    //copy image from cpu to gpu
    cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    //run kernel function
    bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

    cudaDeviceSynchronize();

    //copy yuv420p from gpu to cpu
    cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
    for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
    cudaFree(d_in);
    cudaFree(d_out);


    return 0;

}
$ nvcc -o t1708 t1708.cu
$ cuda-memcheck ./t1708
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$

Any time you are having trouble with a CUDA code, I recommend

  1. Proper CUDA error checking
  2. Running your code with cuda-memcheck

EDIT: Based on additional comments, here is a version of the above code that uses the OP-supplied CPU code verbatim, and provides a CUDA kernel that generates YUV planar storage (instead of semi-planar storage):

#include <iostream>
#include <time.h>
#include <cstdlib>
using namespace std;
__host__ __device__ unsigned char bgr2y(int R, int G, int B){
  int Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
  return (unsigned char)((Y<0)? 0 : ((Y > 255) ? 255 : Y));}
__host__ __device__ int bgr2u(int R, int G, int B){
  int U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
  return (unsigned char)((U<0)? 0 : ((U > 255) ? 255 : U));}
__host__ __device__ int bgr2v(int R, int G, int B){
  int V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;
  return (unsigned char)((V<0)? 0 : ((V > 255) ? 255 : V));}

void bgr_to_yuv420sp(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr== NULL)
        return;
    int frameSize = width*height;

    int yIndex = 0;
    int uIndex = frameSize;

    int R, G, B;
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            B = bgr[(i * width + j) * 3 + 0];
            G = bgr[(i * width + j) * 3 + 1];
            R = bgr[(i * width + j) * 3 + 2];

            //BGR to YUV
            yuv420p[yIndex++] = bgr2y(R,G,B);
            if (i % 2 == 0 && j % 2 == 0)
            {
                yuv420p[uIndex] = bgr2u(R,G,B);
                yuv420p[uIndex+1] = bgr2v(R,G,B);
                uIndex+=2;
            }
        }
    }
}
void bgr_to_yuv420p(unsigned  char* yuv420p, unsigned char* bgr, int width, int height)
{
    if (yuv420p == NULL || bgr== NULL)
        return;
    int frameSize = width*height;
    int chromaSize = frameSize / 4;

    int yIndex = 0;
    int uIndex = frameSize;
    int vIndex = frameSize + chromaSize;

    int R, G, B, Y, U, V;
    for (int i = 0; i < height; i++)
    {
        for (int j = 0; j < width; j++)
        {
            B = bgr[(i * width + j) * 3 + 0];
            G = bgr[(i * width + j) * 3 + 1];
            R = bgr[(i * width + j) * 3 + 2];

            //BGR to YUV
            Y = ((66 * R + 129 * G + 25 * B + 128) >> 8) + 16;
            U = ((-38 * R - 74 * G + 112 * B + 128) >> 8) + 128;
            V = ((112 * R - 94 * G - 18 * B + 128) >> 8) + 128;

            yuv420p[yIndex++] = (unsigned char)((Y < 0) ? 0 : ((Y > 255) ? 255 : Y));
            if (i % 2 == 0 && j % 2 == 0)
            {
                yuv420p[uIndex++] = (unsigned char)((U < 0) ? 0 : ((U > 255) ? 255 : U));
                yuv420p[vIndex++] = (unsigned char)((V < 0) ? 0 : ((V > 255) ? 255 : V));
            }
        }
    }
}
//kernel function to convert bgr to yuv420sp
__global__ void bgr2yuv420sp(uchar3 *  d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{

    int col_num = blockIdx.x*blockDim.x+threadIdx.x;
    int row_num = blockIdx.y*blockDim.y+threadIdx.y;

    if ((row_num < imgheight) && (col_num < imgwidth))
    {
//        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
        int global_offset = row_num*imgwidth+col_num;

        int r,g,b;
        r = int(d_in[global_offset].z);
        g = int (d_in[global_offset].y);
        b = int (d_in[global_offset].x);


        d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
        if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
            int uv_offset = imgwidth*imgheight+((row_num>>1)*imgwidth)+col_num;
            d_out[uv_offset] = bgr2u(r,g,b);
            d_out[uv_offset+1] = bgr2v(r,g,b);

        }

    }
}
//kernel function to convert bgr to yuv420p
__global__ void bgr2yuv420p(uchar3 *  d_in, unsigned char * d_out,
                               uint imgheight, uint imgwidth)
{

    int col_num = blockIdx.x*blockDim.x+threadIdx.x;
    int row_num = blockIdx.y*blockDim.y+threadIdx.y;

    if ((row_num < imgheight) && (col_num < imgwidth))
    {
//        uint32_t a = *((uint32_t *)&dinput[global_offset*3]);
        int global_offset = row_num*imgwidth+col_num;

        int r,g,b;
        r = int(d_in[global_offset].z);
        g = int (d_in[global_offset].y);
        b = int (d_in[global_offset].x);


        d_out[row_num * imgwidth + col_num] = bgr2y(r,g,b);
        if(((threadIdx.x & 1) == 0)  && ((threadIdx.y & 1) == 0)){
            int u_offset = imgwidth*imgheight+((row_num>>1)*(imgwidth>>1))+(col_num>>1);
            d_out[u_offset] = bgr2u(r,g,b);
            int v_offset = u_offset+((imgheight>>1)*(imgwidth>>1));
            d_out[v_offset] = bgr2v(r,g,b);

        }
    }
}


int main(void)
{

    const uint imgheight = 1000;
    const uint imgwidth = 1500;

    //input and output
    uchar3 *d_in;
    unsigned char *d_out;
    uchar3 *idata = new uchar3[imgheight*imgwidth];
    unsigned char *odata = new unsigned char[imgheight*imgwidth*3/2];
    unsigned char *cdata = new unsigned char[imgheight*imgwidth*3/2];
    uchar3 pix;
    for (int i = 0; i < imgheight*imgwidth; i++){
      pix.x = (rand()%30)+40;
      pix.y = (rand()%30)+40;
      pix.z = (rand()%30)+40;
      idata[i] = pix;}
    for (int i = 0; i < imgheight*imgwidth; i++) idata[i] = pix;
    bgr_to_yuv420p(cdata, (unsigned char*) idata, imgwidth, imgheight);
    // malloc memo in gpu
    cudaMalloc((void**)&d_in, imgheight*imgwidth*sizeof(uchar3));
    cudaMalloc((void**)&d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2);

    //copy image from cpu to gpu
    cudaMemcpy(d_in, idata, imgheight*imgwidth*sizeof(uchar3), cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(32, 32);
    dim3 blocksPerGrid((imgwidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgheight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    //run kernel function
    bgr2yuv420p<<<blocksPerGrid, threadsPerBlock>>>(d_in, d_out, imgheight, imgwidth);

    cudaDeviceSynchronize();

    //copy yuv420p from gpu to cpu
    cudaMemcpy(odata, d_out, imgheight*imgwidth*sizeof(unsigned char) * 3 / 2, cudaMemcpyDeviceToHost);
    for (int i = 0; i < (imgwidth*imgheight*3/2); i++) if (odata[i] != cdata[i]) {std::cout << "mismatch at: " << i << " was: " << (int)odata[i] << " should be: " << (int)cdata[i] << std::endl; return 0;}
    cudaFree(d_in);
    cudaFree(d_out);


    return 0;

}

I don't claim correctness for this code or any other code that I post. Anyone using any code I post does so at their own risk. I merely claim that I have attempted to address the deficiencies that I found in the original posting, and provide some explanation thereof. I am not claiming my code is defect-free, or that it is suitable for any particular purpose. Use it (or not) at your own risk.

Robert Crovella
  • 143,785
  • 11
  • 213
  • 257
  • I test your code and the result is nice: result in cpu and result in gpu is totally the same. But your code is different from mine. I use yuv420p and you use yuv420sp.I will offer a answer discussing about it below. – yuyuyu May 12 '20 at 06:36
  • In your question, you have a CPU code that is planar and a GPU implementation that is semi-planar. Of course the results cannot match. The result organization is different. I had to pick one approach or the other to show equivalent outputs, I picked semi-planar. Its entirely possible to have a GPU code that outputs a planar format result. – Robert Crovella May 12 '20 at 15:25
  • Awesome! Your new code works.I really appreciate you. You keep on answering my question so timely and finally solving it. I would like to express my thanks again. Hope this answer will be helpful to someone else. – yuyuyu May 13 '20 at 09:33