I'm attempting to render video using the Microsoft DX11VideoRenderer sample found at: https://github.com/Microsoft/Windows-classic-samples/tree/master/Samples/DX11VideoRenderer
From my research it seems that DirectX 11 with hardware acceleration is the most up-to-date approach (least likely to be deprecated) and offers the best performance.
There are two similar functions within Presenter.cpp that process frames, but I cannot figure out the difference between them. ProcessFrameUsingD3D11() uses VideoProcessorBlt() to actually do the render. The mystery is that ProcessFrameUsingXVP() does not call this function, so how does it actually do the render? Or is it doing something else entirely? It also appears that my implementation is using ProcessFrameUsingXVP(), based on the value of the variable m_useXVP, which defaults to 1. Here is the code sample:
if (m_useXVP)
{
    BOOL bInputFrameUsed = FALSE;

    hr = ProcessFrameUsingXVP( pCurrentType, pSample, pTexture2D, rcDest, ppOutputSample, &bInputFrameUsed );

    if (SUCCEEDED(hr) && !bInputFrameUsed)
    {
        *pbProcessAgain = TRUE;
    }
}
else
{
    hr = ProcessFrameUsingD3D11( pTexture2D, pEVTexture2D, dwViewIndex, dwEVViewIndex, rcDest, *punInterlaceMode, ppOutputSample );

    LONGLONG hnsDuration = 0;
    LONGLONG hnsTime = 0;
    DWORD dwSampleFlags = 0;

    if (ppOutputSample != NULL && *ppOutputSample != NULL)
    {
        if (SUCCEEDED(pSample->GetSampleDuration(&hnsDuration)))
        {
            (*ppOutputSample)->SetSampleDuration(hnsDuration);
        }

        if (SUCCEEDED(pSample->GetSampleTime(&hnsTime)))
        {
            (*ppOutputSample)->SetSampleTime(hnsTime);
        }

        if (SUCCEEDED(pSample->GetSampleFlags(&dwSampleFlags)))
        {
            (*ppOutputSample)->SetSampleFlags(dwSampleFlags);
        }
    }
}
The reason for setting m_useXVP is also a mystery to me, and I cannot find an answer in my research. It is read from a registry key that does not exist on my particular Windows 10 PC, so the value is never modified:
const TCHAR* lpcszInVP = TEXT("XVP");
const TCHAR* lpcszREGKEY = TEXT("SOFTWARE\\Microsoft\\Scrunch\\CodecPack\\MSDVD");

if (0 == RegOpenKeyEx(HKEY_CURRENT_USER, lpcszREGKEY, 0, KEY_READ, &hk))
{
    dwData = 0;
    cbData = sizeof(DWORD);
    if (0 == RegQueryValueEx(hk, lpcszInVP, 0, &cbType, (LPBYTE)&dwData, &cbData))
    {
        m_useXVP = dwData;
    }
}
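As an aside, my workaround for switching paths while experimenting (my own assumption, not anything documented by the sample) is either to hard-code m_useXVP = 0 in the presenter's constructor or to create the registry value the sample queries. A quick sketch of the latter:

// Hypothetical one-off helper: create the value the sample queries so that
// m_useXVP reads as 0 and ProcessFrameUsingD3D11() is taken instead of the XVP path.
HKEY hk = NULL;
DWORD dwUseXVP = 0;   // 0 = D3D11 path, non-zero = XVP path

if (ERROR_SUCCESS == RegCreateKeyEx(HKEY_CURRENT_USER,
        TEXT("SOFTWARE\\Microsoft\\Scrunch\\CodecPack\\MSDVD"),
        0, NULL, REG_OPTION_NON_VOLATILE, KEY_WRITE, NULL, &hk, NULL))
{
    RegSetValueEx(hk, TEXT("XVP"), 0, REG_DWORD,
                  reinterpret_cast<const BYTE*>(&dwUseXVP), sizeof(dwUseXVP));
    RegCloseKey(hk);
}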
So since my PC does not have this key, the code defaults to using ProcessFrameUsingXVP(). Here is its definition:
HRESULT DX11VideoRenderer::CPresenter::ProcessFrameUsingXVP(IMFMediaType* pCurrentType, IMFSample* pVideoFrame, ID3D11Texture2D* pTexture2D, RECT rcDest, IMFSample** ppVideoOutFrame, BOOL* pbInputFrameUsed)
{
HRESULT hr = S_OK;
ID3D11VideoContext* pVideoContext = NULL;
ID3D11Texture2D* pDXGIBackBuffer = NULL;
IMFSample* pRTSample = NULL;
IMFMediaBuffer* pBuffer = NULL;
IMFAttributes* pAttributes = NULL;
D3D11_VIDEO_PROCESSOR_CAPS vpCaps = { 0 };
do
{
if (!m_pDX11VideoDevice)
{
hr = m_pD3D11Device->QueryInterface(__uuidof(ID3D11VideoDevice), (void**)&m_pDX11VideoDevice);
if (FAILED(hr))
{
break;
}
}
hr = m_pD3DImmediateContext->QueryInterface(__uuidof(ID3D11VideoContext), (void**)&pVideoContext);
if (FAILED(hr))
{
break;
}
// remember the original rectangles
RECT TRectOld = m_rcDstApp;
RECT SRectOld = m_rcSrcApp;
UpdateRectangles(&TRectOld, &SRectOld);
//Update destination rect with current client rect
m_rcDstApp = rcDest;
D3D11_TEXTURE2D_DESC surfaceDesc;
pTexture2D->GetDesc(&surfaceDesc);
BOOL fTypeChanged = FALSE;
if (!m_pVideoProcessorEnum || !m_pSwapChain1 || m_imageWidthInPixels != surfaceDesc.Width || m_imageHeightInPixels != surfaceDesc.Height)
{
SafeRelease(m_pVideoProcessorEnum);
SafeRelease(m_pSwapChain1);
m_imageWidthInPixels = surfaceDesc.Width;
m_imageHeightInPixels = surfaceDesc.Height;
fTypeChanged = TRUE;
D3D11_VIDEO_PROCESSOR_CONTENT_DESC ContentDesc;
ZeroMemory(&ContentDesc, sizeof(ContentDesc));
ContentDesc.InputFrameFormat = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_TOP_FIELD_FIRST;
ContentDesc.InputWidth = surfaceDesc.Width;
ContentDesc.InputHeight = surfaceDesc.Height;
ContentDesc.OutputWidth = surfaceDesc.Width;
ContentDesc.OutputHeight = surfaceDesc.Height;
ContentDesc.Usage = D3D11_VIDEO_USAGE_PLAYBACK_NORMAL;
hr = m_pDX11VideoDevice->CreateVideoProcessorEnumerator(&ContentDesc, &m_pVideoProcessorEnum);
if (FAILED(hr))
{
break;
}
m_rcSrcApp.left = 0;
m_rcSrcApp.top = 0;
m_rcSrcApp.right = m_uiRealDisplayWidth;
m_rcSrcApp.bottom = m_uiRealDisplayHeight;
if (m_b3DVideo)
{
hr = m_pVideoProcessorEnum->GetVideoProcessorCaps(&vpCaps);
if (FAILED(hr))
{
break;
}
if (vpCaps.FeatureCaps & D3D11_VIDEO_PROCESSOR_FEATURE_CAPS_STEREO)
{
m_bStereoEnabled = TRUE;
}
DXGI_MODE_DESC1 modeFilter = { 0 };
modeFilter.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
modeFilter.Width = surfaceDesc.Width;
modeFilter.Height = surfaceDesc.Height;
modeFilter.Stereo = m_bStereoEnabled;
DXGI_MODE_DESC1 matchedMode;
if (m_bFullScreenState)
{
hr = m_pDXGIOutput1->FindClosestMatchingMode1(&modeFilter, &matchedMode, m_pD3D11Device);
if (FAILED(hr))
{
break;
}
}
hr = m_pXVP->GetAttributes(&pAttributes);
if (FAILED(hr))
{
break;
}
hr = pAttributes->SetUINT32(MF_ENABLE_3DVIDEO_OUTPUT, (0 != m_vp3DOutput) ? MF3DVideoOutputType_Stereo : MF3DVideoOutputType_BaseView);
if (FAILED(hr))
{
break;
}
}
}
// now create the input and output media types - these need to reflect
// the src and destination rectangles that we have been given.
RECT TRect = m_rcDstApp;
RECT SRect = m_rcSrcApp;
UpdateRectangles(&TRect, &SRect);
const BOOL fDestRectChanged = !EqualRect(&TRect, &TRectOld);
const BOOL fSrcRectChanged = !EqualRect(&SRect, &SRectOld);
if (!m_pSwapChain1 || fDestRectChanged)
{
hr = UpdateDXGISwapChain();
if (FAILED(hr))
{
break;
}
}
if (fTypeChanged || fSrcRectChanged || fDestRectChanged)
{
// stop streaming to avoid multiple start\stop calls internally in XVP
hr = m_pXVP->ProcessMessage(MFT_MESSAGE_NOTIFY_END_STREAMING, 0);
if (FAILED(hr))
{
break;
}
if (fTypeChanged)
{
hr = SetXVPOutputMediaType(pCurrentType, DXGI_FORMAT_B8G8R8A8_UNORM);
if (FAILED(hr))
{
break;
}
}
if (fDestRectChanged)
{
hr = m_pXVPControl->SetDestinationRectangle(&m_rcDstApp);
if (FAILED(hr))
{
break;
}
}
if (fSrcRectChanged)
{
hr = m_pXVPControl->SetSourceRectangle(&SRect);
if (FAILED(hr))
{
break;
}
}
hr = m_pXVP->ProcessMessage(MFT_MESSAGE_NOTIFY_BEGIN_STREAMING, 0);
if (FAILED(hr))
{
break;
}
}
m_bCanProcessNextSample = FALSE;
// Get Backbuffer
hr = m_pSwapChain1->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&pDXGIBackBuffer);
if (FAILED(hr))
{
break;
}
// create the output media sample
hr = MFCreateSample(&pRTSample);
if (FAILED(hr))
{
break;
}
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 0, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
if (m_b3DVideo && 0 != m_vp3DOutput)
{
SafeRelease(pBuffer);
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 1, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
}
DWORD dwStatus = 0;
MFT_OUTPUT_DATA_BUFFER outputDataBuffer = {};
outputDataBuffer.pSample = pRTSample;
hr = m_pXVP->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT)
{
//call process input on the MFT to deliver the YUV video sample
// and the call process output to extract of newly processed frame
hr = m_pXVP->ProcessInput(0, pVideoFrame, 0);
if (FAILED(hr))
{
break;
}
*pbInputFrameUsed = TRUE;
hr = m_pXVP->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (FAILED(hr))
{
break;
}
}
else
{
*pbInputFrameUsed = FALSE;
}
if (ppVideoOutFrame != NULL)
{
*ppVideoOutFrame = pRTSample;
(*ppVideoOutFrame)->AddRef();
}
} while (FALSE);
SafeRelease(pAttributes);
SafeRelease(pBuffer);
SafeRelease(pRTSample);
SafeRelease(pDXGIBackBuffer);
SafeRelease(pVideoContext);
return hr;
}
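As far as I can tell, the only place that function actually produces a rendered frame is the tail end: the output sample wraps the swap-chain back buffer via MFCreateDXGISurfaceBuffer(), and the frame is pulled through the transform with the usual ProcessOutput()/MF_E_TRANSFORM_NEED_MORE_INPUT/ProcessInput() handshake. Stripped of the 3D and error handling, my reading of that handshake is roughly this (hypothetical variable names, not code from the sample):

// pXVP:          the configured video processor transform (m_pXVP in the sample)
// pInputFrame:   the decoded YUV sample handed to ProcessFrameUsingXVP()
// pOutputSample: an IMFSample whose buffer wraps the swap-chain back buffer
MFT_OUTPUT_DATA_BUFFER outputDataBuffer = {};
outputDataBuffer.pSample = pOutputSample;
DWORD dwStatus = 0;

HRESULT hr = pXVP->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
if (hr == MF_E_TRANSFORM_NEED_MORE_INPUT)
{
    // No frame queued yet: feed the input sample, then ask again. After a
    // successful ProcessOutput() the back buffer holds the scaled/converted
    // frame and all that remains is to Present() the swap chain.
    hr = pXVP->ProcessInput(0, pInputFrame, 0);
    if (SUCCEEDED(hr))
    {
        hr = pXVP->ProcessOutput(0, 1, &outputDataBuffer, &dwStatus);
    }
}

So whatever the transform does internally to fill that surface is exactly the part I'm trying to understand.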
And here is the definition of ProcessFrameUsingD3D11():
HRESULT DX11VideoRenderer::CPresenter::ProcessFrameUsingD3D11( ID3D11Texture2D* pLeftTexture2D, ID3D11Texture2D* pRightTexture2D, UINT dwLeftViewIndex, UINT dwRightViewIndex,
RECT rcDest, UINT32 unInterlaceMode, IMFSample** ppVideoOutFrame )
{
HRESULT hr = S_OK;
ID3D11VideoContext* pVideoContext = NULL;
ID3D11VideoProcessorInputView* pLeftInputView = NULL;
ID3D11VideoProcessorInputView* pRightInputView = NULL;
ID3D11VideoProcessorOutputView* pOutputView = NULL;
ID3D11Texture2D* pDXGIBackBuffer = NULL;
ID3D11RenderTargetView* pRTView = NULL;
IMFSample* pRTSample = NULL;
IMFMediaBuffer* pBuffer = NULL;
D3D11_VIDEO_PROCESSOR_CAPS vpCaps = {0};
LARGE_INTEGER lpcStart,lpcEnd;
do
{
if (!m_pDX11VideoDevice)
{
hr = m_pD3D11Device->QueryInterface(__uuidof(ID3D11VideoDevice), (void**)&m_pDX11VideoDevice);
if (FAILED(hr))
{
break;
}
}
hr = m_pD3DImmediateContext->QueryInterface(__uuidof( ID3D11VideoContext ), (void**)&pVideoContext);
if (FAILED(hr))
{
break;
}
// remember the original rectangles
RECT TRectOld = m_rcDstApp;
RECT SRectOld = m_rcSrcApp;
UpdateRectangles(&TRectOld, &SRectOld);
//Update destination rect with current client rect
m_rcDstApp = rcDest;
D3D11_TEXTURE2D_DESC surfaceDesc;
pLeftTexture2D->GetDesc(&surfaceDesc);
if (!m_pVideoProcessorEnum || !m_pVideoProcessor || m_imageWidthInPixels != surfaceDesc.Width || m_imageHeightInPixels != surfaceDesc.Height)
{
SafeRelease(m_pVideoProcessorEnum);
SafeRelease(m_pVideoProcessor);
m_imageWidthInPixels = surfaceDesc.Width;
m_imageHeightInPixels = surfaceDesc.Height;
D3D11_VIDEO_PROCESSOR_CONTENT_DESC ContentDesc;
ZeroMemory( &ContentDesc, sizeof( ContentDesc ) );
ContentDesc.InputFrameFormat = D3D11_VIDEO_FRAME_FORMAT_INTERLACED_TOP_FIELD_FIRST;
ContentDesc.InputWidth = surfaceDesc.Width;
ContentDesc.InputHeight = surfaceDesc.Height;
ContentDesc.OutputWidth = surfaceDesc.Width;
ContentDesc.OutputHeight = surfaceDesc.Height;
ContentDesc.Usage = D3D11_VIDEO_USAGE_PLAYBACK_NORMAL;
hr = m_pDX11VideoDevice->CreateVideoProcessorEnumerator(&ContentDesc, &m_pVideoProcessorEnum);
if (FAILED(hr))
{
break;
}
UINT uiFlags;
DXGI_FORMAT VP_Output_Format = DXGI_FORMAT_B8G8R8A8_UNORM;
hr = m_pVideoProcessorEnum->CheckVideoProcessorFormat(VP_Output_Format, &uiFlags);
if (FAILED(hr) || 0 == (uiFlags & D3D11_VIDEO_PROCESSOR_FORMAT_SUPPORT_OUTPUT))
{
hr = MF_E_UNSUPPORTED_D3D_TYPE;
break;
}
m_rcSrcApp.left = 0;
m_rcSrcApp.top = 0;
m_rcSrcApp.right = m_uiRealDisplayWidth;
m_rcSrcApp.bottom = m_uiRealDisplayHeight;
DWORD index;
hr = FindBOBProcessorIndex(&index); // GG This may not be needed. BOB is something to do with deinterlacing.
if (FAILED(hr))
{
break;
}
hr = m_pDX11VideoDevice->CreateVideoProcessor(m_pVideoProcessorEnum, index, &m_pVideoProcessor);
if (FAILED(hr))
{
break;
}
if (m_b3DVideo)
{
hr = m_pVideoProcessorEnum->GetVideoProcessorCaps(&vpCaps);
if (FAILED(hr))
{
break;
}
if (vpCaps.FeatureCaps & D3D11_VIDEO_PROCESSOR_FEATURE_CAPS_STEREO)
{
m_bStereoEnabled = TRUE;
}
DXGI_MODE_DESC1 modeFilter = { 0 };
modeFilter.Format = DXGI_FORMAT_B8G8R8A8_UNORM;
modeFilter.Width = surfaceDesc.Width;
modeFilter.Height = surfaceDesc.Height;
modeFilter.Stereo = m_bStereoEnabled;
DXGI_MODE_DESC1 matchedMode;
if (m_bFullScreenState)
{
hr = m_pDXGIOutput1->FindClosestMatchingMode1(&modeFilter, &matchedMode, m_pD3D11Device);
if (FAILED(hr))
{
break;
}
}
}
}
// now create the input and output media types - these need to reflect
// the src and destination rectangles that we have been given.
RECT TRect = m_rcDstApp;
RECT SRect = m_rcSrcApp;
UpdateRectangles(&TRect, &SRect);
const BOOL fDestRectChanged = !EqualRect(&TRect, &TRectOld);
if (!m_pSwapChain1 || fDestRectChanged)
{
hr = UpdateDXGISwapChain();
if (FAILED(hr))
{
break;
}
}
m_bCanProcessNextSample = FALSE;
// Get Backbuffer
hr = m_pSwapChain1->GetBuffer(0, __uuidof(ID3D11Texture2D), (void**)&pDXGIBackBuffer);
if (FAILED(hr))
{
break;
}
// create the output media sample
hr = MFCreateSample(&pRTSample);
if (FAILED(hr))
{
break;
}
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 0, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
// GG For 3D - don't need.
if (m_b3DVideo && 0 != m_vp3DOutput)
{
SafeRelease(pBuffer);
hr = MFCreateDXGISurfaceBuffer(__uuidof(ID3D11Texture2D), pDXGIBackBuffer, 1, FALSE, &pBuffer);
if (FAILED(hr))
{
break;
}
hr = pRTSample->AddBuffer(pBuffer);
if (FAILED(hr))
{
break;
}
}
QueryPerformanceCounter(&lpcStart);
QueryPerformanceCounter(&lpcEnd);
//
// Create Output View of Output Surfaces.
//
D3D11_VIDEO_PROCESSOR_OUTPUT_VIEW_DESC OutputViewDesc;
ZeroMemory( &OutputViewDesc, sizeof( OutputViewDesc ) );
if (m_b3DVideo && m_bStereoEnabled)
{
OutputViewDesc.ViewDimension = D3D11_VPOV_DIMENSION_TEXTURE2DARRAY;
}
else
{
OutputViewDesc.ViewDimension = D3D11_VPOV_DIMENSION_TEXTURE2D;
}
OutputViewDesc.Texture2D.MipSlice = 0;
OutputViewDesc.Texture2DArray.MipSlice = 0;
OutputViewDesc.Texture2DArray.FirstArraySlice = 0;
if (m_b3DVideo && 0 != m_vp3DOutput)
{
OutputViewDesc.Texture2DArray.ArraySize = 2; // STEREO
}
QueryPerformanceCounter(&lpcStart);
hr = m_pDX11VideoDevice->CreateVideoProcessorOutputView(pDXGIBackBuffer, m_pVideoProcessorEnum, &OutputViewDesc, &pOutputView);
if (FAILED(hr))
{
break;
}
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC InputLeftViewDesc;
ZeroMemory( &InputLeftViewDesc, sizeof( InputLeftViewDesc ) );
InputLeftViewDesc.FourCC = 0;
InputLeftViewDesc.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D;
InputLeftViewDesc.Texture2D.MipSlice = 0;
InputLeftViewDesc.Texture2D.ArraySlice = dwLeftViewIndex;
hr = m_pDX11VideoDevice->CreateVideoProcessorInputView(pLeftTexture2D, m_pVideoProcessorEnum, &InputLeftViewDesc, &pLeftInputView);
if (FAILED(hr))
{
break;
}
if (m_b3DVideo && MFVideo3DSampleFormat_MultiView == m_vp3DOutput && pRightTexture2D)
{
D3D11_VIDEO_PROCESSOR_INPUT_VIEW_DESC InputRightViewDesc;
ZeroMemory( &InputRightViewDesc, sizeof( InputRightViewDesc ) );
InputRightViewDesc.FourCC = 0;
InputRightViewDesc.ViewDimension = D3D11_VPIV_DIMENSION_TEXTURE2D;
InputRightViewDesc.Texture2D.MipSlice = 0;
InputRightViewDesc.Texture2D.ArraySlice = dwRightViewIndex;
hr = m_pDX11VideoDevice->CreateVideoProcessorInputView(pRightTexture2D, m_pVideoProcessorEnum, &InputRightViewDesc, &pRightInputView);
if (FAILED(hr))
{
break;
}
}
QueryPerformanceCounter(&lpcEnd);
QueryPerformanceCounter(&lpcStart);
SetVideoContextParameters(pVideoContext, &SRect, &TRect, unInterlaceMode);
// Enable/Disable Stereo
if (m_b3DVideo)
{
pVideoContext->VideoProcessorSetOutputStereoMode(m_pVideoProcessor, m_bStereoEnabled);
D3D11_VIDEO_PROCESSOR_STEREO_FORMAT vpStereoFormat = D3D11_VIDEO_PROCESSOR_STEREO_FORMAT_SEPARATE;
if (MFVideo3DSampleFormat_Packed_LeftRight == m_vp3DOutput)
{
vpStereoFormat = D3D11_VIDEO_PROCESSOR_STEREO_FORMAT_HORIZONTAL;
}
else if (MFVideo3DSampleFormat_Packed_TopBottom == m_vp3DOutput)
{
vpStereoFormat = D3D11_VIDEO_PROCESSOR_STEREO_FORMAT_VERTICAL;
}
pVideoContext->VideoProcessorSetStreamStereoFormat(m_pVideoProcessor,
0, m_bStereoEnabled, vpStereoFormat, TRUE, TRUE, D3D11_VIDEO_PROCESSOR_STEREO_FLIP_NONE, 0);
}
QueryPerformanceCounter(&lpcEnd);
QueryPerformanceCounter(&lpcStart);
D3D11_VIDEO_PROCESSOR_STREAM StreamData;
ZeroMemory( &StreamData, sizeof( StreamData ) );
StreamData.Enable = TRUE;
StreamData.OutputIndex = 0;
StreamData.InputFrameOrField = 0;
StreamData.PastFrames = 0;
StreamData.FutureFrames = 0;
StreamData.ppPastSurfaces = NULL;
StreamData.ppFutureSurfaces = NULL;
StreamData.pInputSurface = pLeftInputView;
StreamData.ppPastSurfacesRight = NULL;
StreamData.ppFutureSurfacesRight = NULL;
if (m_b3DVideo && MFVideo3DSampleFormat_MultiView == m_vp3DOutput && pRightTexture2D)
{
StreamData.pInputSurfaceRight = pRightInputView;
}
hr = pVideoContext->VideoProcessorBlt(m_pVideoProcessor, pOutputView, 0, 1, &StreamData );
if (FAILED(hr))
{
break;
}
QueryPerformanceCounter(&lpcEnd);
if (ppVideoOutFrame != NULL)
{
*ppVideoOutFrame = pRTSample;
(*ppVideoOutFrame)->AddRef();
}
}
while (FALSE);
SafeRelease(pBuffer);
SafeRelease(pRTSample);
SafeRelease(pDXGIBackBuffer);
SafeRelease(pOutputView);
SafeRelease(pLeftInputView);
SafeRelease(pRightInputView);
SafeRelease(pVideoContext);
return hr;
}
One last note: the documentation states:
Specifically, this sample shows how to:
- Decode the video using the Media Foundation APIs
- Render the decoded video using the DirectX 11 APIs
- Output the video stream to multi-monitor displays
I cannot find anything that does decoding, unless it happens through some MF magic incantation that I haven't stumbled across yet. It's not a showstopper, because I can put an H.264 decoder MFT in front of the renderer without a problem; I would just like to clarify the documentation.
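For reference, this is roughly how I plan to instantiate that decoder in front of the renderer (a sketch under my own assumptions, not something taken from the sample): enumerate the registered H.264 video decoder MFTs and activate the first match.

#include <mfapi.h>
#include <mfidl.h>
#include <mftransform.h>
// link: mfplat.lib, mfuuid.lib; assumes MFStartup(MF_VERSION) has already been called

IMFTransform* pDecoder = NULL;
MFT_REGISTER_TYPE_INFO inputType = { MFMediaType_Video, MFVideoFormat_H264 };

IMFActivate** ppActivate = NULL;
UINT32 count = 0;
HRESULT hr = MFTEnumEx(MFT_CATEGORY_VIDEO_DECODER,
                       MFT_ENUM_FLAG_SYNCMFT | MFT_ENUM_FLAG_ASYNCMFT |
                       MFT_ENUM_FLAG_HARDWARE | MFT_ENUM_FLAG_SORTANDFILTER,
                       &inputType, NULL, &ppActivate, &count);
if (SUCCEEDED(hr) && count > 0)
{
    hr = ppActivate[0]->ActivateObject(IID_PPV_ARGS(&pDecoder));
    // Next steps (omitted here): set the H.264 input type and an NV12 output type,
    // and hand the decoder the renderer's IMFDXGIDeviceManager via
    // MFT_MESSAGE_SET_D3D_MANAGER so it decodes straight into D3D11 textures.
}

for (UINT32 i = 0; i < count; i++)
{
    ppActivate[i]->Release();
}
CoTaskMemFree(ppActivate);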
Any help would be much appreciated. Thank you!