
I am building an app that uses speech recognition to translate speech to text. Everything works fine, but I want to save the audio recording so that I can compare what was said to what was transcribed. Here's my recording code:

    func startRecording() -> String {
        // Configure the audio session for the app.
        let audioSession = AVAudioSession.sharedInstance()
        try! audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
        try! audioSession.setActive(true, options: .notifyOthersOnDeactivation)
        let inputNode = audioEngine.inputNode

        let recordingFormat = inputNode.outputFormat(forBus: 0)
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            self.recognitionRequest?.append(buffer)
        }

        audioEngine.prepare()
        try! audioEngine.start()

        // Create and configure the speech recognition request.
        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else { fatalError("Unable to create a SFSpeechAudioBufferRecognitionRequest object") }
        recognitionRequest.shouldReportPartialResults = true

        // Create a recognition task for the speech recognition session.
        // Keep a reference to the task so that it can be canceled.
        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false
            if let result = result {
                // Update the text view with the results.
                self.recognizedText = result.bestTranscription.formattedString
                isFinal = result.isFinal
            }
            if error != nil || isFinal {
                // Stop recognizing speech if there is a problem.
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)
                self.recognitionRequest = nil
                self.recognitionTask = nil
            }
        }

        return recognizedText
    }

I am trying to upload the captured audio to AWS S3 with this function:

    func tapUploadVideo(_ sender: Any) {
        //guard let path = Bundle.main.path(forResource: "Video", ofType: "mov") else { return }
        let videoUrl = URL(fileURLWithPath: "your video file path")
        AWSS3Manager.shared.uploadVideo(videoUrl: videoUrl, progress: { [weak self] (progress) in

            guard let strongSelf = self else { return }
            strongSelf.progressView.progress = Float(progress)

        }) { [weak self] (uploadedFileUrl, error) in

            guard let strongSelf = self else { return }
            if let finalPath = uploadedFileUrl as? String {
                strongSelf.s3UrlLabel.text = "Uploaded file url: " + finalPath
            } else {
                print("\(String(describing: error?.localizedDescription))")
            }
        }
    }

How can I get the local URL of the captured audio?


1 Answer

Currently you're using an API that takes advantage of streaming audio. The upside is that you can theoretically stream forever without taking up disk space (aside from whatever caching happens along the way). The downside is that nothing gets recorded.

Here are two ways you could approach fixing this.

1.) First record the audio, save it to disk, and then pass it to the other API Apple provides: SFSpeechURLRecognitionRequest instead of SFSpeechAudioBufferRecognitionRequest. You'd record the audio by some other means and then pass in the URL of the file you recorded, as in the sketch below.
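
For illustration, here's a minimal sketch of that approach. The file name, recorder settings, and the two helper function names are made up for the example; only the `speechRecognizer` property is assumed to exist, like the one in your code:

    import AVFoundation
    import Speech

    // Sketch of option 1: record straight to disk, then recognize the file.
    // The file name and recorder settings here are illustrative.
    let recordingUrl = FileManager.default
        .urls(for: .documentDirectory, in: .userDomainMask)[0]
        .appendingPathComponent("recording.m4a")

    func startFileRecording() throws -> AVAudioRecorder {
        let settings: [String: Any] = [
            AVFormatIDKey: Int(kAudioFormatMPEG4AAC),
            AVSampleRateKey: 44100,
            AVNumberOfChannelsKey: 1
        ]
        let recorder = try AVAudioRecorder(url: recordingUrl, settings: settings)
        recorder.record()
        return recorder
    }

    func recognizeRecording(with speechRecognizer: SFSpeechRecognizer) {
        // Call this after recorder.stop(): the finished file is handed to
        // the URL-based request instead of streaming live buffers.
        let request = SFSpeechURLRecognitionRequest(url: recordingUrl)
        speechRecognizer.recognitionTask(with: request) { result, error in
            if let result = result, result.isFinal {
                print(result.bestTranscription.formattedString)
            }
        }
    }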

2.) This one is probably the one you're looking for. As you build up self.recognitionRequest in the installTap callback, you can also write each buffer to disk using AVAudioFile. Here is an example that seems to work :)

Once you save the file, you can use your existing upload code with the URL of that file to send it wherever you want (see the short upload sketch after the example). Good luck!

    private func startRecording() throws {
        // Cancel the previous task if it's running.
        recognitionTask?.cancel()
        self.recognitionTask = nil

        // Configure the audio session for the app.
        let audioSession = AVAudioSession.sharedInstance()
        try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
        try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
        let inputNode = audioEngine.inputNode

        // Create and configure the speech recognition request.
        recognitionRequest = SFSpeechAudioBufferRecognitionRequest()
        guard let recognitionRequest = recognitionRequest else { fatalError("Unable to create a SFSpeechAudioBufferRecognitionRequest object") }
        recognitionRequest.shouldReportPartialResults = true

        // Allow recognition to use Apple's servers (set this to true instead
        // if you want to keep speech recognition data on device).
        if #available(iOS 13, *) {
            recognitionRequest.requiresOnDeviceRecognition = false
        }

        // Create a recognition task for the speech recognition session.
        // Keep a reference to the task so that it can be canceled.
        recognitionTask = speechRecognizer.recognitionTask(with: recognitionRequest) { result, error in
            var isFinal = false

            if let result = result {
                // Update the text view with the results.
                self.textView.text = result.bestTranscription.formattedString
                isFinal = result.isFinal
                print("Text \(result.bestTranscription.formattedString)")
            }

            if error != nil || isFinal {
                // Stop recognizing speech if there is a problem.
                self.audioEngine.stop()
                inputNode.removeTap(onBus: 0)

                self.recognitionRequest = nil
                self.recognitionTask = nil

                self.recordButton.isEnabled = true
                self.recordButton.setTitle("Start Recording", for: [])
            }
        }

        // Use the input node's native format; installing a tap with a format
        // that doesn't match the hardware can raise an exception.
        let recordingFormat = inputNode.outputFormat(forBus: 0)

        // Set up a file to record to.
        let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
        let recordingPath = paths[0].appendingPathComponent("recording.wav")
        let audioFile = try AVAudioFile(forWriting: recordingPath, settings: recordingFormat.settings, commonFormat: recordingFormat.commonFormat, interleaved: recordingFormat.isInterleaved)

        // Configure the microphone input: feed each buffer to the recognizer
        // and also write it to the file on disk.
        inputNode.installTap(onBus: 0, bufferSize: 1024, format: recordingFormat) { (buffer: AVAudioPCMBuffer, when: AVAudioTime) in
            self.recognitionRequest?.append(buffer)

            do {
                try audioFile.write(from: buffer)
            } catch {
                print(error.localizedDescription)
            }
        }

        audioEngine.prepare()
        try audioEngine.start()

        // Let the user know to start talking.
        textView.text = "(Go ahead, I'm listening)"
    }
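
And to answer the question about the local URL directly: it's just the `recordingPath` the example writes to, so you can rebuild the same URL and hand it to your existing upload helper. A minimal sketch, reusing the `AWSS3Manager.uploadVideo` call exactly as it appears in your question:

    // Rebuild the URL the recording was written to and upload that file.
    let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
    let recordingUrl = paths[0].appendingPathComponent("recording.wav")

    AWSS3Manager.shared.uploadVideo(videoUrl: recordingUrl, progress: { progress in
        print("Upload progress: \(progress)")
    }) { uploadedFileUrl, error in
        if let finalPath = uploadedFileUrl as? String {
            print("Uploaded file url: " + finalPath)
        } else {
            print(String(describing: error?.localizedDescription))
        }
    }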