I'm attempting to convert a frame of 3 channel packed rgb to nv12 using Nvidia's npp library. Here is the code I have so far:
//cpu buffer that will hold converted data
Npp8u* converted_data = (Npp8u*)malloc(frameToWrite.getWidth());
memset(converted_data, 0, frameToWrite.getSize());
//Begin - load data and convert rgb to yuv
{
NppStatus ret = NPP_SUCCESS;
int stepSource;
Npp8u* frame = nppiMalloc_8u_C3(frameToWrite.getWidth(), frameToWrite.getHeight(), &stepSource);
cudaMemcpy2D(frame, stepSource, frameToWrite.getFrame(), frameToWrite.getSizePerRow(), frameToWrite.getWidth(), frameToWrite.getHeight(), cudaMemcpyHostToDevice);
int stepDestP1, stepDestP2, stepDestP3;
Npp8u* m_stYuvP1 = nppiMalloc_8u_C1(frameToWrite.getWidth(), frameToWrite.getHeight(), &stepDestP1);
Npp8u* m_stYuvP2 = nppiMalloc_8u_C1(frameToWrite.getWidth(), frameToWrite.getHeight(), &stepDestP2);
Npp8u* m_stYuvP3 = nppiMalloc_8u_C1(frameToWrite.getWidth(), frameToWrite.getHeight(), &stepDestP3);
int d_steps[3] = { stepDestP1, stepDestP2, stepDestP3 };
Npp8u* d_ptrs[3] = { m_stYuvP1, m_stYuvP2, m_stYuvP3 };
NppiSize ROI = { frameToWrite.getWidth(), frameToWrite.getHeight() };
if ((ret = nppiRGBToYUV_8u_C3P3R(frame, stepSource, d_ptrs, stepDestP1, ROI)) != NPP_SUCCESS)
return ERROR_CODE_NVENC_ERROR_UNKNOWN;
cudaMemcpy2D(converted_data, frameToWrite.getWidth(), m_stYuvP1, stepDestP1, frameToWrite.getWidth(), frameToWrite.getHeight(), cudaMemcpyDeviceToHost);
}
Its mostly based off of this stack overflow question, but I adjusted it to fit my case. As a side note, frameToWrite.getSize()
is calculated like this:
mFrameSize = ((getBytesPerPixel() * mWidth) + mPaddingInBytes) * mHeight;
where getBytesPerPixel()
usually returns 3
.
Ultimately my questions are:
- How should I go about retrieving the converted image data from device memory?
- Did I pass the unconverted image data to the device in the correct manner?