
I'm using a Remote IO audio unit to capture PCM audio buffers, and I want to send the data in real time to a Darwin server over a cellular (3G) network. I chose the AAC format, following the Fraunhofer article "AAC-ELD based Audio Communication on iOS A Developer’s Guide". The sample code from the article works great: audio is recorded as LPCM, encoded to AAC-ELD, decoded back to LPCM, and played back immediately, but that uses the AAC-ELD (Enhanced Low Delay) format.

When I change the format from kAudioFormatMPEG4AAC_ELD to kAudioFormatMPEG4AAC, I hear audio for 1 second, then it's stuck for the next 1 second, and the pattern repeats. The audio also plays back at twice the real speed: sound that lasts 1 second in the real world lasts only 0.5 second on playback. I then changed the sample frame size from 512 to 1024; the speed is normal again, but now I hear audio for 2 seconds, it's stuck for the next 2 seconds, and the pattern continues. I figured out that AudioConverterFillComplexBuffer fails for 2 seconds and then works well for the next 2 seconds, and I don't know why. I really didn't change much of the code, only the format ID and the sample frame size from 512 to 1024. Please help. Thanks in advance.

The article is here: http://www.full-hd-voice.com/content/dam/fullhdvoice/documents/iOS-ACE-AP-v2.pdf
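
As far as I understand, plain AAC (kAudioFormatMPEG4AAC, i.e. AAC-LC) packs 1024 PCM frames into each packet, whereas AAC-ELD packs 512, so changing the format ID also changes how much input the converter wants per packet. Below is a small diagnostic sketch I can drop in to confirm this; the two helper functions are mine, not part of the article's sample code:

/* Diagnostic helpers (mine, not from the Fraunhofer sample). */
#include <AudioToolbox/AudioToolbox.h>
#include <CoreFoundation/CoreFoundation.h>
#include <stdio.h>
#include <string.h>

static void LogFramesPerPacket(const AudioStreamBasicDescription *fmt)
{
  /* Call after kAudioFormatProperty_FormatInfo has filled in the ASBD:
     expect 512 for kAudioFormatMPEG4AAC_ELD and 1024 for kAudioFormatMPEG4AAC. */
  printf("destination format expects %u PCM frames per packet\n",
         (unsigned)fmt->mFramesPerPacket);
}

static void LogConverterStatus(const char *label, OSStatus status)
{
  /* Core Audio errors are often four-character codes; print both forms. */
  char code[5] = {0};
  UInt32 bigEndian = CFSwapInt32HostToBig((UInt32)status);
  memcpy(code, &bigEndian, 4);
  printf("%s: %d ('%s')\n", label, (int)status, code);
}

Calling LogFramesPerPacket(&encoder->destinationFormat) in InitAACELDEncoder and LogConverterStatus("FillComplexBuffer", status) in EncodeAACELD should show whether the converter is asking for more input frames per packet than one captured buffer provides.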

1. Global variables

static AudioBuffer            g_inputBuffer;
static AudioBuffer            g_outputBuffer;
static AudioComponentInstance g_audioUnit;
static AudioUnitElement       g_outputBus      = 0; 
static AudioUnitElement       g_inputBus       = 1;
static UInt32                 g_outChannels    = 2;
static UInt32                 g_inChannels     = 1;
static UInt32                 g_frameSize      = 1024;
static UInt32                 g_inputByteSize  = 0; 
static UInt32                 g_outputByteSize = 0; 
static unsigned int           g_initialized    = 0;
static AACELDEncoder         *g_encoder        = NULL;
static AACELDDecoder         *g_decoder        = NULL;
static MagicCookie            g_cookie;

/* Structure to keep the encoder configuration */
typedef struct EncoderProperties_
{
  Float64 samplingRate;
  UInt32  inChannels;
  UInt32  outChannels;
  UInt32  frameSize;
  UInt32  bitrate;
} EncoderProperties;

/* Structure to keep the magic cookie */
typedef struct MagicCookie_
{
  void *data;
  int byteSize;
} MagicCookie;

/* Structure to keep one encoded AU */
typedef struct EncodedAudioBuffer_
{
  UInt32 mChannels;
  UInt32 mDataBytesSize;
  void *data;
} EncodedAudioBuffer;

typedef struct DecoderProperties_
{
  Float64 samplingRate;
  UInt32  inChannels;
  UInt32  outChannels;
  UInt32  frameSize;
} DecoderProperties;
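
The AACELDEncoder and AACELDDecoder types used throughout the code come from the Fraunhofer sample and aren't reproduced here. Roughly, reconstructed from the fields the functions below actually touch (field order and any members I don't use are guesses on my part), they look like this:

/* Sketch of the encoder/decoder state structs, reconstructed from the
   fields referenced later in this post; the originals live in the
   Fraunhofer sample code. */
typedef struct AACELDEncoder_
{
  AudioStreamBasicDescription sourceFormat;        /* LPCM input format  */
  AudioStreamBasicDescription destinationFormat;   /* AAC output format  */
  AudioConverterRef           audioConverter;
  Float64                     samplingRate;
  UInt32                      inChannels;
  UInt32                      outChannels;
  UInt32                      frameSize;
  UInt32                      bitrate;
  UInt32                      maxOutputPacketSize;
  void                       *encoderBuffer;       /* one encoded AU       */
  AudioBuffer                *currentSampleBuffer; /* LPCM to be encoded   */
  UInt32                      bytesToEncode;
} AACELDEncoder;

typedef struct AACELDDecoder_
{
  AudioStreamBasicDescription  sourceFormat;       /* AAC input format   */
  AudioStreamBasicDescription  destinationFormat;  /* LPCM output format */
  AudioConverterRef            audioConverter;
  Float64                      samplingRate;
  UInt32                       inChannels;
  UInt32                       outChannels;
  UInt32                       frameSize;
  UInt32                       maxOutputPacketSize;
  void                        *decodeBuffer;       /* encoded AU to decode */
  UInt32                       bytesToDecode;
  AudioStreamPacketDescription packetDesc[1];
} AACELDDecoder;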

2. Initialise the audio session, the audio unit, and the encoder & decoder

void InitAudioUnit()
{
  /* Calculate the required input and output buffer sizes */
  g_inputByteSize  = g_frameSize * g_inChannels  * sizeof(AudioSampleType);
  g_outputByteSize = g_frameSize * g_outChannels * sizeof(AudioSampleType);

  /* Initialize the I/O buffers */
  g_inputBuffer.mNumberChannels = g_inChannels;
  g_inputBuffer.mDataByteSize   = g_inputByteSize;

  if (g_initialized)
    free(g_inputBuffer.mData);
  g_inputBuffer.mData           = malloc(sizeof(unsigned char)*g_inputByteSize);
  memset(g_inputBuffer.mData, 0, g_inputByteSize);

  g_outputBuffer.mNumberChannels = g_outChannels;
  g_outputBuffer.mDataByteSize   = g_outputByteSize;
  if (g_initialized)
    free(g_outputBuffer.mData);
  g_outputBuffer.mData           = malloc(sizeof(unsigned char)*g_outputByteSize);
  memset(g_outputBuffer.mData, 0, g_outputByteSize);
  g_initialized = 1;

  /* Initialize the audio session */
  AudioSessionInitialize(NULL, NULL, interruptionListener, NULL);
  /* Activate the audio session */
  AudioSessionSetActive(TRUE);

  /* Enable recording for full-duplex I/O */
  UInt32 audioCategory = kAudioSessionCategory_PlayAndRecord;
  AudioSessionSetProperty(kAudioSessionProperty_AudioCategory, 
                          sizeof(audioCategory), 
                          &audioCategory);
  /* Set the route change listener */
  AudioSessionAddPropertyListener(kAudioSessionProperty_AudioRouteChange, 
                                  routeChangeListener, 
                                  NULL);

  /* Set the preferred buffer time */
  Float32 preferredBufferTime = 1024.0 / 44100.0;
  AudioSessionSetProperty(kAudioSessionProperty_PreferredHardwareIOBufferDuration, 
                          sizeof(preferredBufferTime), 
                          &preferredBufferTime);

  /* Setup the audio component for I/O */
  AudioComponentDescription componentDesc;
  memset(&componentDesc, 0, sizeof(componentDesc));

  componentDesc.componentType         = kAudioUnitType_Output;
  componentDesc.componentSubType      = kAudioUnitSubType_RemoteIO; 
  componentDesc.componentManufacturer = kAudioUnitManufacturer_Apple;

  /* Find and create the audio component */
  AudioComponent auComponent = AudioComponentFindNext(NULL, &componentDesc);
  AudioComponentInstanceNew(auComponent, &g_audioUnit);

  /* Enable the audio input */
  UInt32 enableAudioInput = 1;
  AudioUnitSetProperty(g_audioUnit, 
                       kAudioOutputUnitProperty_EnableIO, 
                       kAudioUnitScope_Input, 
                       g_inputBus, 
                       &enableAudioInput, 
                       sizeof(enableAudioInput));

  /* Setup the render callback */
  AURenderCallbackStruct renderCallbackInfo;
  renderCallbackInfo.inputProc       = audioUnitRenderCallback;
  renderCallbackInfo.inputProcRefCon = NULL;
  AudioUnitSetProperty(g_audioUnit, 
                       kAudioUnitProperty_SetRenderCallback, 
                       kAudioUnitScope_Input, 
                       g_outputBus, 
                       &renderCallbackInfo, 
                       sizeof(renderCallbackInfo));

  /* Set the input and output audio stream formats */
  AudioStreamBasicDescription audioFormat;
  audioFormat.mSampleRate       = 44100;
  audioFormat.mFormatID         = kAudioFormatLinearPCM;
  audioFormat.mFormatFlags      = kAudioFormatFlagIsSignedInteger | kAudioFormatFlagIsPacked;
  audioFormat.mFramesPerPacket  = 1;
  audioFormat.mBitsPerChannel   = 8 * sizeof(AudioSampleType);
  audioFormat.mChannelsPerFrame = g_inChannels;
  audioFormat.mBytesPerFrame    = audioFormat.mChannelsPerFrame * sizeof(AudioSampleType);
  audioFormat.mBytesPerPacket   = audioFormat.mBytesPerFrame;

  AudioUnitSetProperty(g_audioUnit, 
                       kAudioUnitProperty_StreamFormat, 
                       kAudioUnitScope_Output, 
                       g_inputBus, 
                       &audioFormat, 
                       sizeof(audioFormat));

  audioFormat.mChannelsPerFrame = g_outChannels;
  audioFormat.mBytesPerFrame    = audioFormat.mChannelsPerFrame * sizeof(AudioSampleType);
  audioFormat.mBytesPerPacket   = audioFormat.mBytesPerFrame;

  AudioUnitSetProperty(g_audioUnit, 
                       kAudioUnitProperty_StreamFormat, 
                       kAudioUnitScope_Input, 
                       g_outputBus, 
                       &audioFormat, 
                       sizeof(audioFormat));

  /* Initialize the ELD codec */
  InitAACELD();
}
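
(Aside: the hardware may not grant exactly the preferred 1024/44100 s buffer duration, and the render callback's inNumberOfFrames follows the real buffer size, not the preferred one, so I also check it after activating the session. The snippet below is mine, not from the article, and would be pasted inside InitAudioUnit:)

  /* Diagnostic sketch (mine): confirm the actual I/O buffer size. If the
     render callback delivers a different frame count than g_frameSize,
     the fixed-size memcpy/encode in the callback will mismatch. */
  Float32 actualBufferDuration = 0.0f;
  UInt32  durationSize = sizeof(actualBufferDuration);
  AudioSessionGetProperty(kAudioSessionProperty_CurrentHardwareIOBufferDuration,
                          &durationSize,
                          &actualBufferDuration);
  printf("actual I/O buffer duration: %f s (~%d frames at 44.1 kHz)\n",
         actualBufferDuration,
         (int)(actualBufferDuration * 44100.0f + 0.5f));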

void InitAACELD()
{
  EncoderProperties p;
  p.samplingRate = 44100.0;
  p.inChannels   = 1;
  p.outChannels  = 1;
  p.frameSize    = 1024;
  p.bitrate      = 32000;

  g_encoder = CreateAACELDEncoder();
  InitAACELDEncoder(g_encoder, p, &g_cookie);

  DecoderProperties dp;
  dp.samplingRate = 44100.0;
  dp.inChannels   = 1;
  dp.outChannels  = 2;
  dp.frameSize    = p.frameSize;

  g_decoder = CreateAACELDDecoder();
  InitAACELDDecoder(g_decoder, dp, &g_cookie);
}

int InitAACELDEncoder(AACELDEncoder *encoder, EncoderProperties props, MagicCookie *outCookie)
{
  /* Copy the provided encoder properties */
  encoder->inChannels   = props.inChannels;
  encoder->outChannels  = props.outChannels;
  encoder->samplingRate = props.samplingRate;
  encoder->frameSize    = props.frameSize;
  encoder->bitrate      = props.bitrate;

  /* Convenience macro to fill out the ASBD structure.
     Available only when __cplusplus is defined! */
  FillOutASBDForLPCM(encoder->sourceFormat, 
                     encoder->samplingRate, 
                     encoder->inChannels, 
                     8*sizeof(AudioSampleType), 
                     8*sizeof(AudioSampleType), 
                     false, 
                     false);

  /* Set the format parameters for AAC-ELD encoding. */
  encoder->destinationFormat.mFormatID         = kAudioFormatMPEG4AAC;
  encoder->destinationFormat.mChannelsPerFrame = encoder->outChannels;
  encoder->destinationFormat.mSampleRate       = encoder->samplingRate;

  /* Get the size of the formatinfo structure */
  UInt32 dataSize = sizeof(encoder->destinationFormat);

  /* Request the property from Core Audio */
  AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 
                         0, 
                         NULL, 
                         &dataSize, 
                         &(encoder->destinationFormat));

  /* Create a new audio converter */
  AudioConverterNew(&(encoder->sourceFormat), 
                    &(encoder->destinationFormat), 
                    &(encoder->audioConverter));

  if (!encoder->audioConverter)
  {
    return -1;
  }

  /* Try to set the desired output bitrate */
  UInt32 outputBitrate = encoder->bitrate;
  dataSize = sizeof(outputBitrate);

  AudioConverterSetProperty(encoder->audioConverter, 
                            kAudioConverterEncodeBitRate, 
                            dataSize, 
                            &outputBitrate);

  /* Query the maximum possible output packet size */
  if (encoder->destinationFormat.mBytesPerPacket == 0) 
  {
    UInt32 maxOutputSizePerPacket = 0;
    dataSize = sizeof(maxOutputSizePerPacket);
    AudioConverterGetProperty(encoder->audioConverter, 
                              kAudioConverterPropertyMaximumOutputPacketSize, 
                              &dataSize, 
                              &maxOutputSizePerPacket);
    encoder->maxOutputPacketSize = maxOutputSizePerPacket;
  }
  else
  {
    encoder->maxOutputPacketSize = encoder->destinationFormat.mBytesPerPacket;
  }

  /* Fetch the Magic Cookie from the ELD implementation */
  UInt32 cookieSize = 0;
  AudioConverterGetPropertyInfo(encoder->audioConverter, 
                                kAudioConverterCompressionMagicCookie, 
                                &cookieSize, 
                                NULL);

  char* cookie = (char*)malloc(cookieSize*sizeof(char));
  AudioConverterGetProperty(encoder->audioConverter, 
                            kAudioConverterCompressionMagicCookie, 
                            &cookieSize, 
                            cookie);

  outCookie->data     = cookie;
  outCookie->byteSize = cookieSize;

  /* Prepare the temporary AU buffer for encoding */
  encoder->encoderBuffer = malloc(encoder->maxOutputPacketSize);

  return 0;
}

int InitAACELDDecoder(AACELDDecoder* decoder, DecoderProperties props, const MagicCookie *cookie)
{
  /* Copy the provided decoder properties */
  decoder->inChannels   = props.inChannels;
  decoder->outChannels  = props.outChannels;
  decoder->samplingRate = props.samplingRate;
  decoder->frameSize    = props.frameSize;

  /* We will decode to LPCM */
  FillOutASBDForLPCM(decoder->destinationFormat, 
                     decoder->samplingRate, 
                     decoder->outChannels, 
                     8*sizeof(AudioSampleType), 
                     8*sizeof(AudioSampleType), 
                     false, 
                     false);

  /* The source is AAC, with the same sampling rate but possibly a different channel configuration */
  decoder->sourceFormat.mFormatID         = kAudioFormatMPEG4AAC;
  decoder->sourceFormat.mChannelsPerFrame = decoder->inChannels;
  decoder->sourceFormat.mSampleRate       = decoder->samplingRate;

  /* Get the rest of the format info */
  UInt32 dataSize = sizeof(decoder->sourceFormat);
  AudioFormatGetProperty(kAudioFormatProperty_FormatInfo, 
                         0, 
                         NULL, 
                         &dataSize, 
                         &(decoder->sourceFormat));

  /* Create a new AudioConverter instance for the conversion AAC-ELD -> LPCM */
  AudioConverterNew(&(decoder->sourceFormat), 
                    &(decoder->destinationFormat), 
                    &(decoder->audioConverter));

  if (!decoder->audioConverter)
  {
    return -1;
  }

  /* Check for variable output packet size */
  if (decoder->destinationFormat.mBytesPerPacket == 0) 
  {
    UInt32 maxOutputSizePerPacket = 0;
    dataSize = sizeof(maxOutputSizePerPacket);
    AudioConverterGetProperty(decoder->audioConverter,
                              kAudioConverterPropertyMaximumOutputPacketSize, 
                              &dataSize, 
                              &maxOutputSizePerPacket);
    decoder->maxOutputPacketSize = maxOutputSizePerPacket;
  }
  else
  {
    decoder->maxOutputPacketSize = decoder->destinationFormat.mBytesPerPacket;
  }

  /* Set the corresponding encoder cookie */
  AudioConverterSetProperty(decoder->audioConverter, 
                            kAudioConverterDecompressionMagicCookie, 
                            cookie->byteSize, 
                            cookie->data);

  return 0;
}

3. Render callback and encoder & decoder

static OSStatus audioUnitRenderCallback(void                       *inRefCon, 
                                        AudioUnitRenderActionFlags *ioActionFlags,
                                        const AudioTimeStamp       *inTimeStamp,
                                        UInt32                      inBusNumber,
                                        UInt32                      inNumberOfFrames,
                                        AudioBufferList            *ioData)
{

  /* Get the input samples */
  AudioUnitRender(g_audioUnit,
                  ioActionFlags,
                  inTimeStamp,
                  g_inputBus,
                  inNumberOfFrames,
                  ioData);

  /* Copy to global input buffer */
  memcpy(g_inputBuffer.mData, ioData->mBuffers[0].mData, g_inputBuffer.mDataByteSize);

  /* Encode with AudioConverter */
  EncodedAudioBuffer encodedAU;
  EncodeAACELD(g_encoder, &g_inputBuffer, &encodedAU);

  /* Decode with AudioConverter */
  g_outputBuffer.mDataByteSize = g_outputByteSize;
  DecodeAACELD(g_decoder, &encodedAU, &g_outputBuffer);

  /* Copy output samples to Audio Units' IO buffer */
  ioData->mBuffers[0].mNumberChannels = g_outputBuffer.mNumberChannels;
  ioData->mBuffers[0].mDataByteSize   = g_outputBuffer.mDataByteSize;
  memcpy(ioData->mBuffers[0].mData, g_outputBuffer.mData, g_outputBuffer.mDataByteSize); 

  return noErr;
}

static OSStatus encodeProc(AudioConverterRef inAudioConverter, 
                           UInt32 *ioNumberDataPackets, 
                           AudioBufferList *ioData, 
                           AudioStreamPacketDescription **outDataPacketDescription, 
                           void *inUserData)
{
  /* Get the current encoder state from the inUserData parameter */
  AACELDEncoder *encoder = (AACELDEncoder*) inUserData;

  /* Compute the maximum number of output packets */
  UInt32 maxPackets = encoder->bytesToEncode / encoder->sourceFormat.mBytesPerPacket;

  if (*ioNumberDataPackets > maxPackets)
  {
    /* If requested number of packets is bigger, adjust */
    *ioNumberDataPackets = maxPackets;
  }

  /* Check to make sure we have only one audio buffer */
  if (ioData->mNumberBuffers != 1)
  {
    return 1;
  }

  /* Set the data to be encoded */
  ioData->mBuffers[0].mDataByteSize   = encoder->currentSampleBuffer->mDataByteSize;
  ioData->mBuffers[0].mData           = encoder->currentSampleBuffer->mData;
  ioData->mBuffers[0].mNumberChannels = encoder->currentSampleBuffer->mNumberChannels;

  if (outDataPacketDescription)
  {
    *outDataPacketDescription = NULL;
  }

  if (encoder->bytesToEncode == 0)
  {
    // We are currently out of data but want to keep on processing 
    // See Apple Technical Q&A QA1317
    return 1; 
  }

  encoder->bytesToEncode = 0;


  return noErr;
}


int EncodeAACELD(AACELDEncoder *encoder, AudioBuffer *inSamples, EncodedAudioBuffer *outData)
{
  /* Clear the encoder buffer */
  memset(encoder->encoderBuffer, 0, encoder->maxOutputPacketSize);

  /* Keep a reference to the samples that should be encoded */
  encoder->currentSampleBuffer = inSamples;
  encoder->bytesToEncode       = inSamples->mDataByteSize;

  UInt32 numOutputDataPackets = 1;

  AudioStreamPacketDescription outPacketDesc[1];

  /* Create the output buffer list */
  AudioBufferList outBufferList;
  outBufferList.mNumberBuffers = 1;
  outBufferList.mBuffers[0].mNumberChannels = encoder->outChannels;
  outBufferList.mBuffers[0].mDataByteSize   = encoder->maxOutputPacketSize;
  outBufferList.mBuffers[0].mData           = encoder->encoderBuffer;

  /* Start the encoding process */
  OSStatus status = AudioConverterFillComplexBuffer(encoder->audioConverter,
                                                    encodeProc, 
                                                    encoder, 
                                                    &numOutputDataPackets, 
                                                    &outBufferList, 
                                                    outPacketDesc);

  if (status != noErr)
  {
    return -1;
  }

  /* Set the output data */
  outData->mChannels      = encoder->outChannels;
  outData->data           = encoder->encoderBuffer;
  outData->mDataBytesSize = outPacketDesc[0].mDataByteSize;

  return 0;
}


static OSStatus decodeProc(AudioConverterRef inAudioConverter, 
                           UInt32 *ioNumberDataPackets, 
                           AudioBufferList *ioData, 
                           AudioStreamPacketDescription **outDataPacketDescription, 
                           void *inUserData)
{
  /* Get the current decoder state from the inUserData parameter */
  AACELDDecoder *decoder = (AACELDDecoder*)inUserData;

  /* Compute the maximum number of output packets */
  UInt32 maxPackets = decoder->bytesToDecode / decoder->maxOutputPacketSize;

  if (*ioNumberDataPackets > maxPackets)
  {
    /* If requested number of packets is bigger, adjust */
    *ioNumberDataPackets = maxPackets;
  }

  /* If there is data to be decoded, set it accordingly */
  if (decoder->bytesToDecode)
  {
    ioData->mBuffers[0].mData           = decoder->decodeBuffer;
    ioData->mBuffers[0].mDataByteSize   = decoder->bytesToDecode;
    ioData->mBuffers[0].mNumberChannels = decoder->inChannels;
  } 

  /* And set the packet description */
  if (outDataPacketDescription)
  {
    decoder->packetDesc[0].mStartOffset            = 0;
    decoder->packetDesc[0].mVariableFramesInPacket = 0;
    decoder->packetDesc[0].mDataByteSize           = decoder->bytesToDecode;

    (*outDataPacketDescription) = decoder->packetDesc;
  }

  if (decoder->bytesToDecode == 0)
  {
    // We are currently out of data but want to keep on processing 
    // See Apple Technical Q&A QA1317
    return 1;
  }

  decoder->bytesToDecode = 0;

  return noErr;
}

int DecodeAACELD(AACELDDecoder* decoder, EncodedAudioBuffer *inData, AudioBuffer *outSamples)
{
  OSStatus status = noErr;

  /* Keep a reference to the samples that should be decoded */
  decoder->decodeBuffer  = inData->data;
  decoder->bytesToDecode = inData->mDataBytesSize;

  UInt32 outBufferMaxSizeBytes = decoder->frameSize * decoder->outChannels * sizeof(AudioSampleType);

  assert(outSamples->mDataByteSize <= outBufferMaxSizeBytes);

  UInt32 numOutputDataPackets = outBufferMaxSizeBytes / decoder->maxOutputPacketSize;

  /* Each decoded AAC packet yields frameSize (1024 here) LPCM frames */
  AudioStreamPacketDescription outputPacketDesc[1024];

  /* Create the output buffer list */
  AudioBufferList outBufferList;
  outBufferList.mNumberBuffers = 1;
  outBufferList.mBuffers[0].mNumberChannels = decoder->outChannels;
  outBufferList.mBuffers[0].mDataByteSize   = outSamples->mDataByteSize;
  outBufferList.mBuffers[0].mData           = outSamples->mData;

  /* Start the decoding process */
  status = AudioConverterFillComplexBuffer(decoder->audioConverter, 
                                           decodeProc, 
                                           decoder, 
                                           &numOutputDataPackets, 
                                           &outBufferList, 
                                           outputPacketDesc);

  if (noErr != status)
  {
    return -1;
  }

  return 0;
}
  • Sorry for the code mess, I've got a little trouble using it. – Ge Liu May 10 '13 at 17:29
  • I really didn't change much of the code just changed the formatID and sample frame size from 512 to 1024 – Ge Liu May 10 '13 at 17:35
  • Did you find a solution? I'm having the same problem... – Mihai Jan 08 '16 at 08:20
  • @MihaiGhete, Sorry, It's been too long, I did solve the problem but I cannot remember how. – Ge Liu Jan 10 '16 at 13:09
  • Damn... it would have saved me a lot of headaches... Thanks anyway! – Mihai Jan 10 '16 at 18:11
  • Was it something in number of in/out channels? Or maybe in the AudioStreamBasicDescription configuration? Does it ring any bells? – Mihai Jan 10 '16 at 18:14
  • Sorry, I sincerely want very much to be of any help but I really couldn't recall any details. I just graduated from college that time and everything was a mess then... – Ge Liu Jan 11 '16 at 06:34
  • I finally found the issue! I was losing my mind over this one. The converter was using kAppleHardwareAudioCodecManufacturer by default. Changing it to kAppleSoftwareAudioCodecManufacturer fixed it. – Mihai Jan 14 '16 at 02:17
  • Congratulations, mate! – Ge Liu Jan 14 '16 at 02:43
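
For reference, the fix Mihai describes in the last comment (forcing Apple's software AAC codec instead of the hardware one) would go where the converters are created in InitAACELDEncoder and InitAACELDDecoder. A rough sketch of the encoder side, not verified against the original project:

  /* Sketch of the fix from the comments: replace the plain
     AudioConverterNew() call with AudioConverterNewSpecific() and
     explicitly request Apple's *software* AAC encoder. */
  AudioClassDescription encoderClass;
  encoderClass.mType         = kAudioEncoderComponentType;            /* 'aenc' */
  encoderClass.mSubType      = kAudioFormatMPEG4AAC;
  encoderClass.mManufacturer = kAppleSoftwareAudioCodecManufacturer;  /* not the HW codec */

  AudioConverterNewSpecific(&(encoder->sourceFormat),
                            &(encoder->destinationFormat),
                            1,              /* one class description */
                            &encoderClass,
                            &(encoder->audioConverter));

The decoder side would use kAudioDecoderComponentType ('adec') in the same way when creating its converter.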
