I am trying to decode a raw H264 stream using VideoToolbox APIs in Swift (macOS).
In viewDidLoad(), I set up my display layer and CMTimebase like so:
self.view.wantsLayer = true
self.VideoLayer = AVSampleBufferDisplayLayer()
self.VideoLayer.frame = self.view.bounds
self.view.layer?.addSublayer(self.VideoLayer)

var _CMTimebasePointer: CMTimebase? = nil
let status = CMTimebaseCreateWithMasterClock(
    allocator: kCFAllocatorDefault,
    masterClock: CMClockGetHostTimeClock(),
    timebaseOut: &_CMTimebasePointer)

self.VideoLayer.controlTimebase = _CMTimebasePointer
CMTimebaseSetTime(self.VideoLayer.controlTimebase!, time: CMTime.zero)
CMTimebaseSetRate(self.VideoLayer.controlTimebase!, rate: 1.0)
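(Side note: on macOS 12+, CMTimebaseCreateWithMasterClock is deprecated in favor of CMTimebaseCreateWithSourceClock; as far as I can tell it behaves identically. A minimal sketch with the status actually checked:)

var timebase: CMTimebase?
let tbStatus = CMTimebaseCreateWithSourceClock(
    allocator: kCFAllocatorDefault,
    sourceClock: CMClockGetHostTimeClock(),
    timebaseOut: &timebase)
if tbStatus != noErr || timebase == nil
{
    print("CMTimebase creation failed: \(tbStatus)")
}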
Then I read my H264 file as raw bytes and parse it into separate NALUs. (I cross-checked my NALU parser against parsers in other projects and it is correct, but if you think I should post its code here, leave a comment and I'll edit my question. :) )
This is how I process each NALU: I write the NALU length into the first 4 bytes (replacing the start code, to convert to the avcC format), and for SPS and PPS NALUs I skip those first 4 bytes:
func decodeFrame(_ videoPacket: inout VideoPacket)
{
    // Replace the 4-byte Annex B start code with the NALU size (avcC format).
    // The length does not include the 4 size bytes themselves.
    var biglen = CFSwapInt32HostToBig(UInt32(videoPacket.count - 4))
    memcpy(&videoPacket, &biglen, 4)

    let nalType = videoPacket[4] & 0x1F
    switch nalType
    {
    case 0x05:
        // IDR frame: create the format description and
        // decompression session before decoding
        createDecompressionSession()
        decodeVideoPacket(videoPacket)
    case 0x07:
        // SPS: keep the payload, skipping the 4 length bytes
        spsSize = videoPacket.count - 4
        sps = Array(videoPacket[4..<videoPacket.count])
    case 0x08:
        // PPS: keep the payload, skipping the 4 length bytes
        ppsSize = videoPacket.count - 4
        pps = Array(videoPacket[4..<videoPacket.count])
    default:
        // P/B frame
        decodeVideoPacket(videoPacket)
    }
}
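(For completeness, the same start-code-to-length replacement can be written without memcpy. A sketch, assuming VideoPacket is a typealias for [UInt8], which is what I use:)

func writeAVCCLength(_ videoPacket: inout VideoPacket)
{
    // big-endian 4-byte NALU length, as avcC expects
    let nalLength = UInt32(videoPacket.count - 4).bigEndian
    withUnsafeBytes(of: nalLength) { lengthBytes in
        videoPacket.replaceSubrange(0..<4, with: lengthBytes)
    }
}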
I then create the VideoFormatDescription like so:
let pointerSPS = UnsafePointer<UInt8>(spsData)
let pointerPPS = UnsafePointer<UInt8>(ppsData)

// make the pointers array
let dataParamArray = [pointerSPS, pointerPPS]
let parameterSetPointers = UnsafePointer<UnsafePointer<UInt8>>(dataParamArray)

// make the parameter sizes array
let sizeParamArray = [spsData.count, ppsData.count]
let parameterSetSizes = UnsafePointer<Int>(sizeParamArray)

let status = CMVideoFormatDescriptionCreateFromH264ParameterSets(
    allocator: kCFAllocatorDefault,
    parameterSetCount: 2,
    parameterSetPointers: parameterSetPointers,
    parameterSetSizes: parameterSetSizes,
    nalUnitHeaderLength: 4,
    formatDescriptionOut: &self.VideoFormatDescription) // class property
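One thing I'm unsure about here: as far as I know, pointers produced by UnsafePointer(array) are only guaranteed valid inside a withUnsafeBufferPointer scope, so the conversions above may dangle. A sketch of the pinned-pointer version I have in mind (same call, same arguments):

// Sketch: pin the parameter-set pointers for the duration of the call,
// since UnsafePointer(array) may dangle once the expression ends.
let status = spsData.withUnsafeBufferPointer { spsPtr in
    ppsData.withUnsafeBufferPointer { ppsPtr -> OSStatus in
        let parameterSetPointers: [UnsafePointer<UInt8>] = [spsPtr.baseAddress!, ppsPtr.baseAddress!]
        let parameterSetSizes = [spsPtr.count, ppsPtr.count]
        return CMVideoFormatDescriptionCreateFromH264ParameterSets(
            allocator: kCFAllocatorDefault,
            parameterSetCount: 2,
            parameterSetPointers: parameterSetPointers,
            parameterSetSizes: parameterSetSizes,
            nalUnitHeaderLength: 4,
            formatDescriptionOut: &self.VideoFormatDescription)
    }
}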
And I make the VTDecompressionSession like so:
let decoderParameters = NSMutableDictionary()
let destinationPixelBufferAttributes = NSMutableDictionary()
destinationPixelBufferAttributes.setValue(
    NSNumber(value: kCVPixelFormatType_32ARGB), // I've tried various values here to no avail...
    forKey: kCVPixelBufferPixelFormatTypeKey as String)

var outputCallback = VTDecompressionOutputCallbackRecord()
outputCallback.decompressionOutputCallback = decompressionSessionDecodeFrameCallback
outputCallback.decompressionOutputRefCon = UnsafeMutableRawPointer(Unmanaged.passUnretained(self).toOpaque())

let status = VTDecompressionSessionCreate(
    allocator: kCFAllocatorDefault,
    formatDescription: videoDescription,
    decoderSpecification: decoderParameters,
    imageBufferAttributes: destinationPixelBufferAttributes,
    outputCallback: &outputCallback,
    decompressionSessionOut: &self.DecompressionSession)
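One extra check I could add here, to rule out the session silently rejecting my format description (sketch, assuming DecompressionSession and VideoFormatDescription are my optional class properties):

// Sketch: verify the session was created and accepts the format description.
if status != noErr
{
    print("VTDecompressionSessionCreate ERROR: \(status)")
}
if let session = self.DecompressionSession,
   let description = self.VideoFormatDescription,
   !VTDecompressionSessionCanAcceptFormatDescription(session, formatDescription: description)
{
    print("Session does not accept the current format description")
}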
Then, this is how I decode each frame:
func decodeVideoPacket(_ videoPacket: VideoPacket)
{
    // wrap the packet's bytes in a CMBlockBuffer (no copy; kCFAllocatorNull
    // means the block buffer does not own the memory)
    let bufferPointer = UnsafeMutablePointer<UInt8>(mutating: videoPacket)
    var blockBuffer: CMBlockBuffer?
    var status = CMBlockBufferCreateWithMemoryBlock(
        allocator: kCFAllocatorDefault,
        memoryBlock: bufferPointer,
        blockLength: videoPacket.count,
        blockAllocator: kCFAllocatorNull,
        customBlockSource: nil,
        offsetToData: 0,
        dataLength: videoPacket.count,
        flags: 0,
        blockBufferOut: &blockBuffer)
    if status != noErr
    {
        print("CMBlockBufferCreateWithMemoryBlock ERROR: \(status)")
        return
    }

    // timing info for a 60 fps stream
    var sampleBuffer: CMSampleBuffer?
    let sampleSizeArray = [videoPacket.count]
    let frameDuration = 1.0 / 60.0
    let presentationTime = CMTimeMakeWithSeconds(
        frameDuration * Double(self.frameCount), preferredTimescale: 1000)
    var info = CMSampleTimingInfo(
        duration: CMTimeMakeWithSeconds(frameDuration, preferredTimescale: 1000),
        presentationTimeStamp: presentationTime,
        decodeTimeStamp: presentationTime)
    self.frameCount += 1

    status = CMSampleBufferCreateReady(
        allocator: kCFAllocatorDefault,
        dataBuffer: blockBuffer,
        formatDescription: self.VideoFormatDescription,
        sampleCount: 1,
        sampleTimingEntryCount: 1,
        sampleTimingArray: &info,
        sampleSizeEntryCount: 1,
        sampleSizeArray: sampleSizeArray,
        sampleBufferOut: &sampleBuffer)
    if status != noErr
    {
        print("CMSampleBufferCreateReady ERROR: \(status)")
        return
    }
    guard let buffer = sampleBuffer else
    {
        print("Could not unwrap sampleBuffer!")
        return
    }

    // hand the sample to the display layer...
    if self.VideoLayer.isReadyForMoreMediaData
    {
        self.VideoLayer.enqueue(buffer)
        self.VideoLayer.displayIfNeeded()
    }

    // ...and to the decompression session
    if let session = self.DecompressionSession
    {
        var outputBuffer: CVPixelBuffer?
        status = VTDecompressionSessionDecodeFrame(
            session,
            sampleBuffer: buffer,
            flags: [],
            frameRefcon: &outputBuffer,
            infoFlagsOut: nil)
        if status != noErr
        {
            print("VTDecompressionSessionDecodeFrame ERROR: \(status)")
        }
        status = VTDecompressionSessionWaitForAsynchronousFrames(session)
        if status != noErr
        {
            print("VTDecompressionSessionWaitForAsynchronousFrames ERROR: \(status)")
        }
    }
}
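One thing I'm not sure about in this function is pointing the block buffer at the array's storage with kCFAllocatorNull, since nothing keeps those bytes alive. A copying variant I could try instead (sketch, again assuming VideoPacket is [UInt8]):

// Sketch: let the block buffer allocate and own its memory,
// then copy the packet bytes into it.
var blockBuffer: CMBlockBuffer?
var status = CMBlockBufferCreateWithMemoryBlock(
    allocator: kCFAllocatorDefault,
    memoryBlock: nil, // CoreMedia allocates the backing memory
    blockLength: videoPacket.count,
    blockAllocator: kCFAllocatorDefault,
    customBlockSource: nil,
    offsetToData: 0,
    dataLength: videoPacket.count,
    flags: 0,
    blockBufferOut: &blockBuffer)
if status == noErr, let blockBuffer = blockBuffer
{
    status = CMBlockBufferAssureBlockMemory(blockBuffer)
}
if status == noErr, let blockBuffer = blockBuffer
{
    status = videoPacket.withUnsafeBufferPointer { ptr in
        CMBlockBufferReplaceDataBytes(
            with: ptr.baseAddress!,
            blockBuffer: blockBuffer,
            offsetIntoDestination: 0,
            dataLength: videoPacket.count)
    }
}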
Finally, in the decode callback, I currently just check whether imageBuffer is nil, but it is always nil, while the OSStatus is always noErr:
private func decompressionSessionDecodeFrameCallback(
    _ decompressionOutputRefCon: UnsafeMutableRawPointer?,
    _ sourceFrameRefCon: UnsafeMutableRawPointer?,
    _ status: OSStatus,
    _ infoFlags: VTDecodeInfoFlags,
    _ imageBuffer: CVImageBuffer?,
    _ presentationTimeStamp: CMTime,
    _ presentationDuration: CMTime) -> Void
{
    print("status: \(status), image_nil?: \(imageBuffer == nil)")
}
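To make that logging more readable, I could map the common VideoToolbox status codes to their names. A small sketch (constants come from VTErrors.h):

// Sketch: name the common VideoToolbox errors so failures
// aren't hidden behind raw OSStatus codes.
func describeVTStatus(_ status: OSStatus) -> String
{
    switch status
    {
    case noErr: return "noErr"
    case kVTInvalidSessionErr: return "kVTInvalidSessionErr"
    case kVTVideoDecoderBadDataErr: return "kVTVideoDecoderBadDataErr"
    case kVTVideoDecoderUnsupportedDataFormatErr: return "kVTVideoDecoderUnsupportedDataFormatErr"
    case kVTVideoDecoderMalfunctionErr: return "kVTVideoDecoderMalfunctionErr"
    default: return "unknown (\(status))"
    }
}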
Clearly, since imageBuffer is always nil, I assume something is going wrong, and the AVSampleBufferDisplayLayer doesn't render any image either.
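The only layer-side diagnostic I've found is its status/error pair, e.g.:

// Check whether the display layer itself has failed.
if self.VideoLayer.status == .failed
{
    print("AVSampleBufferDisplayLayer error: \(String(describing: self.VideoLayer.error))")
}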
Can you please help me find what is wrong with my code, or tell me how to dig deeper into whatever VTDecompressionSession error might be happening but is hidden from me?
PS: let me know if any part of my code needs more explanation.