0

I can see that the system audio for WasapiLoopbackCapture is not the right rate to try and get Microsoft Windows' audio to text recognition working (only 8/16 is supported, eg AudioBitsPerSample.Sixteen ), from the output of this program. The odd thing I cannot figure out is why the resampler in this code is never called. I set a breakpoint - even wait 5s for the system audio to play and be placed in the buffer. I am trying to get system sound, to text using the audio speech recognizer. I can see that bytes are written to captureStream during debugging, so why does the resampler never output bytes? - The Console.WriteLine("Never getting here"); is never getting there.

using System;
using System.Speech.Recognition;
using NAudio.Wave;
using NAudio.CoreAudioApi.Interfaces;

using NAudio.CoreAudioApi;
using System.IO;
using System.Speech.AudioFormat;

namespace SpeechRecognitionApp
{

    class FakeStreamer : Stream
    {
        public bool bExit = false;
        Stream stream;
        Stream client;
        public FakeStreamer(Stream client)
        {
            this.client = client;
            this.stream = client;
        }
        public override bool CanRead
        {
            get { return stream.CanRead; }
        }

        public override bool CanSeek
        {
            get { return false; }
        }

        public override bool CanWrite
        {
            get { return stream.CanWrite; }
        }

        public override long Length
        {
            get { return -1L; }
        }

        public override long Position
        {
            get { return 0L; }
            set { }
        }
        public override long Seek(long offset, SeekOrigin origin)
        {
            return 0L;
        }

        public override void SetLength(long value)
        {
            stream.SetLength(value);
        }
        public override int Read(byte[] buffer, int offset, int count)
        {
            int len = 0, c = count;
            while (c > 0 && !bExit)
            {
                try
                {
                    len = stream.Read(buffer, offset, c);
                }
                catch (Exception e)
                {
                    Console.WriteLine("ouch");
                }
                /*if (!client.Connected || len == 0)
                {
                    //Exit read loop
                    return 0;
                }*/
                offset += len;
                c -= len;
            }
            return count;
        }

        public override void Write(byte[] buffer, int offset, int count)
        {
            stream.Write(buffer, offset, count);
        }

        public override void Close()
        {
            stream.Close();
            base.Close();
        }

        public override void Flush()
        {
            stream.Flush();
        }
    }
    class Program
    {
        static void Main(string[] args)
        {

            // Create an in-process speech recognizer for the en-US locale.  
            using (
            SpeechRecognitionEngine recognizer =
              new SpeechRecognitionEngine(
                new System.Globalization.CultureInfo("en-US")))
            {

                // Create and load a dictation grammar.  
                recognizer.LoadGrammar(new DictationGrammar());

                // Add a handler for the speech recognized event.  
                recognizer.SpeechRecognized +=
                  new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized);

                // Configure input to the speech recognizer.  
                //recognizer.SetInputToDefaultAudioDevice();  
                WasapiLoopbackCapture capture = new WasapiLoopbackCapture();
                Stream captureStream = new System.IO.MemoryStream();
                //Stream buffStream = new FakeStreamer(captureStream);
                capture.DataAvailable += (s, a) =>
                {
                    //It is getting here.
                    captureStream.Write(a.Buffer, 0, a.BytesRecorded);
                };
                Console.WriteLine(capture.WaveFormat.AverageBytesPerSecond);
                Console.WriteLine(capture.WaveFormat.BitsPerSample);
                var newFormat = new WaveFormat(8000, 16, 1);
                //using (var conversionStream = new WaveFormatConversionStream(newFormat, capture)
                //capture.StartRecording();
                using (var resampler = new MediaFoundationResampler(new NAudio.Wave.RawSourceWaveStream(captureStream, capture.WaveFormat), newFormat))
                {
                    Stream captureConvertStream = new System.IO.MemoryStream();
                    resampler.ResamplerQuality = 60;
                    //WaveFileWriter.WriteWavFileToStream(captureConvertStream, resampler);
                    //recognizer.SetInputToDefaultAudioDevice();
                    Stream buffStream = new FakeStreamer(captureConvertStream);

                    recognizer.SetInputToAudioStream(buffStream, new SpeechAudioFormatInfo(
                        8000, AudioBitsPerSample.Sixteen, AudioChannel.Mono));

                    // Start asynchronous, continuous speech recognition.  
                    recognizer.RecognizeAsync(RecognizeMode.Multiple);

                    capture.StartRecording();
                    //Never getting to the resampler, the read is always zero!? even if waiting 5s for the audio to buffer.
                    System.Threading.Thread.Sleep(5000);
                    var arr = new byte[128];
                    while (resampler.Read(arr, 0, arr.Length) > 0)
                    {
                        captureConvertStream.Write(arr, 0, arr.Length);
                        Console.WriteLine("Never getting here");
                    }
                    // Keep the console window open.  
                    while (true)
                    {
                        Console.ReadLine();
                    }
                }
            }
        }

        // Handle the SpeechRecognized event.  
        static void recognizer_SpeechRecognized(object sender, SpeechRecognizedEventArgs e)
        {
            Console.WriteLine("Recognized text: " + e.Result.Text);
        }
    }
}
NoBugs
  • 9,310
  • 13
  • 80
  • 146
  • That's not going very well, hard to get ahead. That FakeStreamer is quite important, run [silence.exe](https://matthewvaneerde.wordpress.com/2008/12/16/sample-wasapi-loopback-capture-record-what-you-hear/) to ensure you need it. Resampling is not a great idea, skip that until you find out you actually need it. Could be a volume problem. – Hans Passant Nov 05 '19 at 22:04
  • Yeah you can see the latest revision that work somewhat, here: https://stackoverflow.com/questions/58678228/wasapiloopbackcapture-internal-audio-recognition-gives-jibberish-and-text-when-n – NoBugs Nov 06 '19 at 02:56

0 Answers0