
I'm very close to completing a personal project for modifying the Windows speech dictionary via C# (SAPI 5.4). The last bit I'm working on is how to get the SAPI phone set for a given word. I've found a way to do this via a C# form and spoken recognition obtained through SpSharedRecoContext. However, I'm now trying to get the recognition to work with a voice file (*.wav) as the input. I understand that this needs to be done via an SpInprocRecoContext.

Every example from Microsoft I've found regarding SAPI 5.4 recognition (like this one for VB) is for SpSharedRecoContext rather than SpInprocRecoContext (and I believe I've seen comments that some of these examples are missing details). Additionally, I've found multiple topics on this forum (mostly answered by Eric Brown; see topic 1, topic 2, topic 3) that mention that using an SpInprocRecoContext requires more setup than an SpSharedRecoContext, but I have yet to find a definitive answer for how to capture voice recognition events when using SpInprocRecoContext in C#.

How can I proceed on this?

Here is my code so far (edited for better organization):

using SpeechLib;
using System;

namespace SpeechTest
{
    class Program
    {       
        static void Main(string[] args)
        {
            string MyText = "dolphins"; // Text string of interest
            string WaveFile = @"C:\Reco\MYWAVE.wav"; // Path to wav file used for voice recognition

            // Declare important recognition objects
            SpInprocRecognizer Recognizer;
            SpInProcRecoContext RecoContext;
            ISpeechRecoGrammar grammar;
            ISpeechFileStream MyFileStream;
            ISpeechVoice MyVoice;

            // Create recognizer and recognition context
            RecoContext = new SpInProcRecoContext();
            Recognizer = (SpInprocRecognizer)RecoContext.Recognizer;            

            // Set up recognition event handling            
            RecoContext.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(RecoContext_Recognition);

            // Set up the grammar
            grammar = RecoContext.CreateGrammar(); // Initialize the grammar
            grammar.DictationLoad("", SpeechLoadOption.SLOStatic); // Set up dictation grammar
            grammar.DictationSetState(SpeechRuleState.SGDSActive); // Activate the grammar

            // Set up audio input for SpInProcRecoContext
            SpObjectTokenCategory Category = new SpObjectTokenCategory();
            Category.SetId(SpeechStringConstants.SpeechCategoryAudioIn);
            SpObjectToken AudioToken = new SpObjectToken();
            AudioToken.SetId(Category.Default);
            Recognizer.AudioInput = AudioToken;

            //Category.SetId(SpeechStringConstants.SpeechCategoryRecognizers); // <== generates a runtime error!!!
            //SpObjectToken EngineToken = new SpObjectToken();
            //EngineToken.SetId(Category.Default);
            //Recognizer.Recognizer = EngineToken;

            //Category.SetId(SpeechStringConstants.SpeechCategoryRecoProfiles); // <== generates a runtime error!!!
            //SpObjectToken ProfileToken = new SpObjectToken();
            //ProfileToken.SetId(Category.Default);
            //Recognizer.Profile = ProfileToken;

            // Create an audio file stream from MyText
            MyFileStream = new SpFileStream(); // Create new SpFileStream instance
            TextToWave(MyText, WaveFile); // Call function to create a wav file that voices MyText
            MyFileStream.Open(WaveFile, SpeechStreamFileMode.SSFMOpenForRead, true);

            // Activate the recognizer and input the audio file stream into the recognizer
            RecoContext.State = SpeechRecoContextState.SRCS_Enabled;
            Recognizer.State = SpeechRecognizerState.SRSActive;
            Recognizer.AudioInputStream = MyFileStream; // <== generates a runtime error!!!

            // Output info and cleanup
            Console.WriteLine(MyText + " = " + SAPIPhonemes);
            MyFileStream.Close();
            Console.ReadLine();
        }

        static void TextToWave(string text, string file)
        {
            SpFileStream fileStream = new SpFileStream();
            SpVoice voice = new SpVoice();

            fileStream.Open(file, SpeechStreamFileMode.SSFMCreateForWrite, true);
            voice.AudioOutputStream = fileStream;
            voice.Speak(text);

            fileStream.Close();
        }

        public static string SAPIPhonemes = null;

        public static void RecoContext_Recognition(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
        {
            // This event is recognized and all the below code works fine when using SpSharedRecoContext

            Console.WriteLine(Result.ToString());
            SAPIPhonemes = null; // Reset the static field (declaring a new local string here would shadow the field printed in Main)
            SpPhoneConverter MyPhoneConverter = new SpPhoneConverter();
            MyPhoneConverter.LanguageId = 1033;

            foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements)
            {
                SAPIPhonemes += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation);
            }
        }
    }
}

For reference here is the form-based SpSharedRecoContext code that works:

using SpeechLib;
using System;
using System.Windows.Forms;

namespace RecoForm
{
    public partial class Form1 : Form
    {
        // Speech Recognition Object
        SpSharedRecoContext listener;

        // Grammar object
        ISpeechRecoGrammar grammar;

        public Form1()
        {
            InitializeComponent();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            // nothing
        }

        public string ps;
        private void button1_Click(object sender, EventArgs e)
        {
            if (btnListen.Text == "Start Listening")
            {
               // textBox1.Clear();
                try
                {

                    listener = new SpSharedRecoContext();
                    listener.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(listener_Reco);
                    grammar = listener.CreateGrammar(0);
                    grammar.DictationLoad("", SpeechLoadOption.SLOStatic);
                    grammar.DictationSetState(SpeechRuleState.SGDSActive);
                    btnListen.Text = "Stop Listening";
                    if (ps == "1")
                    {
                        listener.Resume();
                        ps = "0";
                    }
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.Message);
                }
            }
            else if (btnListen.Text == "Stop Listening")
            {
                listener.Pause();
                btnListen.Text = "Start Listening";
                if (ps == "0")
                {
                    ps = "1";
                }
            }
        }        

        public void listener_Reco(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
        {
            string heard = Result.PhraseInfo.GetText(0, -1, true);
            textBox1.Text += " " + heard;

            SpPhoneConverter MyPhoneConverter = new SpPhoneConverter();
            MyPhoneConverter.LanguageId = 1033;

            foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements)
                textBox2.Text += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation);
        }
    }
}

// https://stackoverflow.com/questions/11935533/c-sharp-sapi-5-4-languages

Here is another example (in VB) that combines the Microsoft examples (here and here) and STILL doesn't work (see the comments in Command1_Click for the location where I encounter a runtime error).

Imports SpeechLib

Public Class Form1
    Const WaveFile = "C:\Reco\MYWAVE.wav"

    Dim WithEvents RC As SpInProcRecoContext
    Dim Recognizer As SpInprocRecognizer
    Dim myGrammar As ISpeechRecoGrammar
    Dim MyFileStream As SpeechLib.SpFileStream
    Dim MyVoice As SpeechLib.SpVoice
    Dim MyText As String

    Private Sub Form1_Load(sender As Object, e As EventArgs) Handles MyBase.Load
        On Error GoTo EH

        RC = New SpInProcRecoContext
        Recognizer = RC.Recognizer

        myGrammar = RC.CreateGrammar
        myGrammar.DictationSetState(SpeechRuleState.SGDSActive)
        MyVoice = New SpVoice
        MyVoice.Voice = MyVoice.GetVoices("gender=male").Item(0)

        Dim Category As SpObjectTokenCategory
        Category = New SpObjectTokenCategory
        Category.SetId(SpeechStringConstants.SpeechCategoryAudioIn)

        Dim Token As SpObjectToken
        Token = New SpObjectToken
        Token.SetId(Category.Default)
        Recognizer.AudioInput = Token

        TextBox1.Text = "play the eight of clubs"

EH:
        If Err.Number Then ShowErrMsg()
    End Sub

    Private Sub Command1_Click(sender As Object, e As EventArgs) Handles Command1.Click
        MyFileStream = MakeWAVFileFromText(TextBox1.Text, WaveFile)
        MyFileStream.Open(WaveFile)
        Recognizer.AudioInputStream = MyFileStream ' ==> produces a runtime error!!!
    End Sub

    Private Sub RC_Recognition(ByVal StreamNumber As Integer, ByVal StreamPosition As Object, ByVal RecognitionType As SpeechLib.SpeechRecognitionType, ByVal Result As SpeechLib.ISpeechRecoResult) Handles RC.Recognition ' The Handles clause is required in VB.NET to wire up the WithEvents handler
        On Error GoTo EH

        TextBox2.Text = Result.PhraseInfo.GetText

EH:
        If Err.Number Then ShowErrMsg()
    End Sub

    Private Sub ShowErrMsg()

        ' Declare identifiers:
        Const NL = vbNewLine
        Dim T As String

        T = "Desc: " & Err.Description & NL
        T = T & "Err #: " & Err.Number
        MsgBox(T, vbExclamation, "Run-Time Error")
        End

    End Sub

    Private Function MakeWAVFileFromText(ByVal strText As String, ByVal strFName As String) As SpFileStream

        On Error GoTo EH

        ' Declare identifiers:
        Dim FileStream As SpFileStream
        Dim Voice As SpVoice

        ' Instantiate Voice and FileStream objects:
        Voice = New SpVoice
        FileStream = New SpFileStream

        ' Open specified .wav file, set voice output
        ' to file, and speak synchronously:
        FileStream.Open(strFName, SpeechStreamFileMode.SSFMCreateForWrite, True)
        Voice.AudioOutputStream = FileStream
        Voice.Speak(strText, SpeechVoiceSpeakFlags.SVSFIsXML)

        ' Close file and return reference to FileStream object:
        FileStream.Close()
        MakeWAVFileFromText = FileStream

EH:
        If Err.Number Then ShowErrMsg()
    End Function
End Class

' https://msdn.microsoft.com/en-us/library/ee125184%28v=vs.85%29.aspx
' https://msdn.microsoft.com/en-us/library/ee125344(v=vs.85).aspx

UPDATE: so THIS works, however the EndStream event does not fire, which keeps Application.Run from returning. I can use some Stopwatch jiggery to close everything up as a workaround (a rough sketch of that fallback follows the code below), but obviously that's not ideal. Please bear in mind that I'm still very much a C# novice, so my comments may not be 100% accurate.

Any ideas on how to get the EndStream event to fire?

using SpeechLib;
using System;
using System.Windows.Forms;

namespace SAPITextFromVoice
{
    class Program
    {
        // Initialize variables needed throughout this code
        static ISpeechRecoGrammar grammar; // Declare the grammar
        static SpFileStream FileStream; // Declare the voice recognition input file stream
        static string AudioPath = null; // Declare directory path to wav file
        static string GrammarPath = null; // Declare directory path to grammar file

        static void Main(string[] args)
        {
            // Initialize string variable for storing the text of interest
            string MyText = "the rain in spain";

            // Store path to speech grammar XML file
            //GrammarPath = @"C:\Reco\MyGrammar.xml";

            // Store path to voice recognition input wav file
            AudioPath = @"C:\Reco\MyAudio.wav";

            TextToWav(AudioPath, MyText);

            try // Attempt the following code
            {
                // Open the created wav in a new FileStream
                FileStream = new SpFileStream(); // Create new instance of SpFileStream
                FileStream.Open(AudioPath, SpeechStreamFileMode.SSFMOpenForRead, true); // Open the specified file in the FileStream for reading with events enabled

                // Create speech recognizer and associated context
                SpInprocRecognizer MyRecognizer = new SpInprocRecognizer(); // Create new instance of SpInprocRecognizer
                SpInProcRecoContext RecoContext = (SpInProcRecoContext)MyRecognizer.CreateRecoContext(); // Initialize the SpInProcRecoContext (in-process recognition context)

                // Set the voice recognition input as the FileStream
                MyRecognizer.AudioInputStream = FileStream; // This will internally "speak" the wav file for input into the voice recognition engine

                // Set up recognition event handling
                RecoContext.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(RecoContext_Recognition); // Register for successful voice recognition events
                RecoContext.FalseRecognition += new _ISpeechRecoContextEvents_FalseRecognitionEventHandler(RecoContext_FalseRecognition); // Register for failed (low confidence) voice recognition events
                RecoContext.Hypothesis += new _ISpeechRecoContextEvents_HypothesisEventHandler(RecoContext_Hypothesis); // Register for voice recognition hypothesis events
                RecoContext.EndStream += new _ISpeechRecoContextEvents_EndStreamEventHandler(RecoContext_EndStream); // Register for end of file stream events

                // Set up the grammar
                grammar = RecoContext.CreateGrammar(); // Initialize the grammar object
                //grammar.CmdLoadFromFile(GrammarPath, SpeechLoadOption.SLODynamic); // Load custom XML grammar file
                //grammar.CmdSetRuleIdState(0, SpeechRuleState.SGDSActive); // Activate the loaded grammar
                grammar.DictationLoad("", SpeechLoadOption.SLOStatic); // Load blank dictation topic into the grammar
                grammar.DictationSetState(SpeechRuleState.SGDSActive); // Activate dictation grammar
            }
            catch // Handle exceptions in above code
            {
                Console.WriteLine("Error during voice recognition setup");
                return; // Stop executing the code
            }

            Application.Run(); // Starts a standard application message loop on the current thread

            Console.WriteLine("done");
            Console.ReadLine();
        }

        // Function for converting text to a voiced wav file via text-to-speech
        public static bool TextToWav(string FilePath, string text)
        {
            try // Attempt the following code
            {
                if (System.IO.File.Exists(FilePath) == true) // Check if voice recognition wav file already exists
                    System.IO.File.Delete(FilePath); // Delete existing voice recognition wav file
                SpFileStream stream = new SpFileStream(); // Create new SpFileStream instance
                stream.Format.Type = SpeechAudioFormatType.SAFT48kHz16BitStereo; // Set the file stream audio format
                stream.Open(FilePath, SpeechStreamFileMode.SSFMCreateForWrite, true); // Open the specified file for writing with events enabled

                SpVoice voice = new SpVoice(); // Create new SPVoice instance
                voice.Volume = 100; // Set the volume level of the text-to-speech voice
                voice.Rate = -2; // Set the rate at which text is spoken by the text-to-speech engine
                string NameAttribute = "Name = Microsoft Anna"; // Required-attributes string used to select the Microsoft Anna voice
                voice.Voice = voice.GetVoices(NameAttribute).Item(0);
                //voice.Speak(text);
                voice.AudioOutputStream = stream; // Send the audio output to the file stream
                voice.Speak(text, SpeechVoiceSpeakFlags.SVSFDefault); // Internally "speak" the inputted text (which records it in the wav file)

                stream.Close(); // Close the file stream
                return true; // Send "true" back to calling code line
            }
            catch // Handle exceptions in above code
            {
                Console.WriteLine("Error during wav file creation");
                return false; // Send "false" back to calling code line
            }
        }

        // Event handler for successful (higher confidence) voice recognition
        public static void RecoContext_Recognition(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
        {
            RecognitionProcessing(Result, true); // Process the voice recognition result
        }

        // Event handler for false (low confidence) voice recognition
        public static void RecoContext_FalseRecognition(int StreamNumber, object StreamPosition, ISpeechRecoResult Result)
        {
            RecognitionProcessing(Result, false); // Process the voice recognition result
        }

        // Event handler for voice recognition hypotheses
        public static void RecoContext_Hypothesis(int StreamNumber, object StreamPosition, ISpeechRecoResult Result)
        {
            float confidence = Result.PhraseInfo.Elements.Item(0).EngineConfidence;
            Console.WriteLine(("Hypothesis = " + Result.PhraseInfo.GetText() + " (" + Decimal.Round(Convert.ToDecimal(confidence), (confidence > 0.01 ? 3 : 4)) + ")")); // Output info to console
        }

        // Event handler for reaching the end of an audio input stream
        public static void RecoContext_EndStream(int StreamNumber, object StreamPosition, bool StreamReleased)
        {
            // Clean up now that voice recognition is complete

            Console.WriteLine("--- END OF STREAM ---"); // Output info to the console

            try // Attempt the following code
            {
                //grammar.CmdSetRuleIdState(0, SpeechRuleState.SGDSInactive); // Deactivate the loaded grammar
                grammar.DictationSetState(SpeechRuleState.SGDSInactive); // Deactivate dictation grammar
                FileStream.Close(); // Close the input FileStream

                Application.ExitThread(); // Terminates the message loop on the current thread
            }
            catch // Handle exceptions in above code
            {
                Console.WriteLine("Error during cleanup process");
            }
        }

        // Function for processing voice recognition results
        public static void RecognitionProcessing(ISpeechRecoResult Result, bool RecoType)
        {
            try // Attempt the following code
            {
                string RecognizedText = Result.PhraseInfo.GetText().Trim(); // Store recognized text    
                float confidence = Result.PhraseInfo.Elements.Item(0).EngineConfidence; // Get confidence of voice recognition result
                decimal RecognitionConfidence = Decimal.Round(Convert.ToDecimal(confidence), (confidence > 0.01 ? 3 : 4)); // Convert the recognition confidence to decimal and round the result
                Console.WriteLine((RecoType == false ? "false " : "") + "recognition = " + RecognizedText + " (" + RecognitionConfidence + ")"); // Output info to the console
                GetPhonemes(Result); // Retrieve SAPI phonemes from recognized words
            }
            catch // Handle exceptions in above code
            {
                Console.WriteLine("Error during processing of recognition result");
            }
        }

        // Function for extracting SAPI phonemes from voice recognition results
        public static void GetPhonemes(ISpeechRecoResult Result)
        {
            try // Attempt the following code
            {
                SpPhoneConverter MyPhoneConverter = new SpPhoneConverter(); // Create new SPPhoneConverter instance
                MyPhoneConverter.LanguageId = 1033; // Set the phone converter's language (English = 1033)
                string SAPIPhonemesRaw = null; // Initialize string for storing raw SAPI phoneme data
                string SAPIPhonemes = null; // Initialize string for storing delimited SAPI phoneme data
                int i = 1; // Initialize integer for tracking phoneme count

                foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements) // Loop through each element of the recognized text
                {
                    SAPIPhonemesRaw += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation); // Build string of SAPI phonemes extracted from the recognized text
                    SAPIPhonemes += (i++ > 1 ? " - " : " ") + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation); // Build string of SAPI phonemes extracted from the recognized text, delimited by "-"
                }

                Console.WriteLine("Phonemes = " + SAPIPhonemes.Trim());
            }
            catch // Handle exceptions in above code
            {
                Console.WriteLine("Error during phoneme extraction");
            }
        }
    }
}
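
For reference, here's a rough sketch of the Stopwatch workaround mentioned above, assuming an arbitrary 10-second budget: a System.Windows.Forms.Timer polls elapsed time on the message-loop thread and force-exits if EndStream never fires.

// Rough sketch of the timer fallback (assumed values, not a proper fix):
// call this in Main in place of the bare Application.Run()
static void RunMessageLoopWithTimeout()
{
    System.Diagnostics.Stopwatch watch = System.Diagnostics.Stopwatch.StartNew();
    System.Windows.Forms.Timer poll = new System.Windows.Forms.Timer();
    poll.Interval = 250; // Check roughly four times per second on the message-loop thread
    poll.Tick += (sender, e) =>
    {
        if (watch.ElapsedMilliseconds > 10000) // Give up after ~10 seconds without an EndStream event
        {
            poll.Stop();
            Application.ExitThread(); // Unblocks Application.Run() below
        }
    };
    poll.Start();
    Application.Run(); // Returns when EndStream fires or the timeout trips
}
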
  • Also, what runtime errors are you getting? – Eric Brown May 15 '18 at 22:38
  • Not very obvious how this happened, the .NET wrapper for SAPI in the System.Speech namespace is very good. Much easier to get going. And to get help with. – Hans Passant May 16 '18 at 19:34
  • Basically I have a voice recognition application that can run C# code as part of a macro. I've created a collection of C# functions that can modify the Windows Speech Recognition dictionary (which the app can leverage for improved recognition and Text-to-Speech), and as far as I know the only way to do this is through SAPI directly. The last piece of my project involves recognizing spoken phrases and extracting the SAPI phonemes which can then serve as input into the dictionary. I know I can get IPA pronunciations with System.Speech, but not sure about SAPI. That's the story in a nutshell. – Exergist May 16 '18 at 23:15

2 Answers


Sorry to take so long, but looking over your code I see a couple of probable issues.

  1. You need to set the input stream on the recognizer before setting the recognizer active. Once the recognizer becomes active, it will start reading immediately. Changing the input stream on an active recognizer will result in an error.
  2. You also need to set a reco profile and a reco engine before setting the recognizer active. I would create separate SpObjectTokenCategory objects for each token type (a minimal sketch of both points follows).
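
A minimal C# sketch of that ordering, using the same SpeechLib interop types as the question (the default tokens and the wav path are illustrative assumptions, not a verified fix):

// Engine and profile tokens first, then the input stream, then activation
SpInprocRecognizer recognizer = new SpInprocRecognizer();

// Point 2: choose a recognition engine via its own token category
SpObjectTokenCategory engineCategory = new SpObjectTokenCategory();
engineCategory.SetId(SpeechStringConstants.SpeechCategoryRecognizers);
SpObjectToken engineToken = new SpObjectToken();
engineToken.SetId(engineCategory.Default);
recognizer.Recognizer = engineToken;

// Point 2: choose a recognition profile via a separate token category
SpObjectTokenCategory profileCategory = new SpObjectTokenCategory();
profileCategory.SetId(SpeechStringConstants.SpeechCategoryRecoProfiles);
SpObjectToken profileToken = new SpObjectToken();
profileToken.SetId(profileCategory.Default);
recognizer.Profile = profileToken;

// Point 1: attach the audio input stream BEFORE the recognizer becomes active
SpFileStream input = new SpFileStream();
input.Open(@"C:\Reco\MYWAVE.wav", SpeechStreamFileMode.SSFMOpenForRead, true);
recognizer.AudioInputStream = input;

// Activate last, so the recognizer starts reading with everything in place
recognizer.State = SpeechRecognizerState.SRSActive;
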
Eric Brown
  • Thanks for the comment Eric, I was hoping you'd find your way to my post :) How does my edited code compare against your above answer? – Exergist May 16 '18 at 18:52
  • Thanks again for your help Eric! You seem to know a lot about speech recognition and SAPI, and I was hoping you might quickly take a look at my related question [here](https://stackoverflow.com/questions/49783718/sapi-symbol-usage-for-speech-dictionary-input). It is the final piece of my project and your feedback would be extremely appreciated! – Exergist Dec 14 '18 at 19:33

I'm circling back to provide the complete solution that allows me to take a given word, create a voiced file stream with text-to-speech, and then extract the SAPI phonemes for that word. Contained within is the answer to my original question. Also, using SpeechLib refers to Interop.SpeechLib.dll, the COM interop assembly for the Microsoft Speech Object Library v5.4.

Please bear in mind that this code is used as an "inline function" within another parent application called VoiceAttack, so the formatting is slightly different from what you'd see in Visual Studio. Converting it to a standalone Visual Studio project isn't difficult, and hopefully others can use this as a springboard for future work.
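
For example, a hypothetical stub of the VoiceAttack proxy (the real VA object is supplied by VoiceAttack at runtime; the member names below simply mirror the calls this class makes) would let the code compile and run as a plain console project:

using System;
using System.Collections.Generic;

// Hypothetical stand-in for VoiceAttack's proxy object, for testing outside VoiceAttack.
// Only the members exercised by the class below are stubbed.
class VAProxyStub
{
    private readonly Dictionary<string, string> TextVars = new Dictionary<string, string>();
    private readonly Dictionary<string, bool?> BoolVars = new Dictionary<string, bool?>();

    public string GetText(string name)
    {
        string value;
        return TextVars.TryGetValue(name, out value) ? value : null;
    }
    public void SetText(string name, string value) { TextVars[name] = value; }
    public bool? GetBoolean(string name)
    {
        bool? value;
        return BoolVars.TryGetValue(name, out value) ? value : null;
    }
    public void SetBoolean(string name, bool? value) { BoolVars[name] = value; }
    public void WriteToLog(string message) { Console.WriteLine(message); }
}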

Please note that I'm a C# hobbyist. The code works exactly as I need it to in terms of functionality and speed, but it may not be as "optimized" as some folks might like and the descriptive comments are limited to my available knowledge. I'm definitely open to suggestions for how to improve it.

Many thanks to Eric Brown for the feedback!

using SpeechLib;
using System;
using System.IO;
using System.Threading;
using System.Windows.Forms;

class VAInline
{
    // Initialize variables needed throughout this code
    ISpeechRecoGrammar grammar; // Declare the grammar
    SpFileStream FileStream; // Declare the voice recognition input file stream
    string AudioPath = null; // Declare directory path to wav file
    string GrammarPath = null; // Declare directory path to grammar file
    string RecognitionFlag = "";
    string RecognitionConfidence = "";
    bool UseDictation; // Declare boolean variable for storing pronunciation dictation grammar setting

    public void main()
    {
        // Reset relevant VoiceAttack text variables
        VA.SetText("~~RecognitionError", null);
        VA.SetText("~~RecognizedText", null);
        VA.SetText("~~SAPIPhonemes", null);
        VA.SetText("~~SAPIPhonemesRaw", null);
        //VA.SetText("~~FalseRecognitionFlag", null);

        // Retrieve the desired word data contained within VoiceAttack text variable
        string ProcessText = null; // Initialize string variable for storing the text of interest
        if (VA.GetText("~~ProcessText") != null) // Check if user provided valid text in input variable
            ProcessText = VA.GetText("~~ProcessText"); // Store text of interest held by VA text variable
        else
        {
            VA.SetText("~~RecognitionError", "Error in input text string (SAPI)"); // Send error detail back to VoiceAttack as text variable
            return; // End code processing
        }

        // Retrieve path to speech grammar XML file from VoiceAttack
        GrammarPath = VA.GetText("~~GrammarFilePath");

        // Retrieve path to voice recognition input wav file from VoiceAttack
        AudioPath = VA.GetText("~~AudioFilePath");

        // Check if TTS engine is voicing the input for the speech recognition engine
        if (VA.GetBoolean("~~UserVoiceInput") == false)
        {
            //VA.WriteToLog("creating wav file");
            if (TextToWav(AudioPath, ProcessText) == false) // Create wav file with specified path that voices specified text (with text-to-speech) and check if the creation was NOT successful
                return; // Stop executing the code
        }

        // Create speech recognizer and associated context
        SpInprocRecognizer MyRecognizer = new SpInprocRecognizer(); // Create new instance of SpInprocRecognizer
        SpInProcRecoContext RecoContext = (SpInProcRecoContext)MyRecognizer.CreateRecoContext(); // Initialize the SpInProcRecoContext (in-process recognition context)

        try // Attempt the following code
        {
            // Open the created wav in a new FileStream
            FileStream = new SpFileStream(); // Create new instance of SpFileStream
            FileStream.Open(AudioPath, SpeechStreamFileMode.SSFMOpenForRead, true); // Open the specified file in the FileStream for reading with events enabled

            // Set the voice recognition input as the FileStream
            MyRecognizer.AudioInputStream = FileStream; // This will internally "speak" the wav file for input into the voice recognition engine

            // Set up recognition event handling
            RecoContext.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(RecoContext_Recognition); // Register for successful voice recognition events
            RecoContext.FalseRecognition += new _ISpeechRecoContextEvents_FalseRecognitionEventHandler(RecoContext_FalseRecognition); // Register for failed (low confidence) voice recognition events
            if (VA.GetBoolean("~~ShowRecognitionHypothesis") == true) // Check if user wants to show voice recognition hypothesis results
                RecoContext.Hypothesis += new _ISpeechRecoContextEvents_HypothesisEventHandler(RecoContext_Hypothesis); // Register for voice recognition hypothesis events
            RecoContext.EndStream += new _ISpeechRecoContextEvents_EndStreamEventHandler(RecoContext_EndStream); // Register for end of file stream events

            // Set up the grammar
            grammar = RecoContext.CreateGrammar(); // Initialize the grammar object
            UseDictation = (bool?)VA.GetBoolean("~~UseDictation") ?? false; // Set UseDictation based on the value from the VoiceAttack boolean variable
            if (UseDictation == true) // Check if pronunciation dictation grammar should be used with speech recognition
            {
                //grammar.DictationLoad("", SpeechLoadOption.SLOStatic); // Load blank dictation topic into the grammar
                grammar.DictationLoad("Pronunciation", SpeechLoadOption.SLOStatic); // Load pronunciation dictation topic into the grammar so that the raw (unfiltered) phonemes may be retrieved
                grammar.DictationSetState(SpeechRuleState.SGDSActive); // Activate dictation grammar
            }
            else
            {
                grammar.CmdLoadFromFile(GrammarPath, SpeechLoadOption.SLODynamic); // Load custom XML grammar file
                grammar.CmdSetRuleIdState(0, SpeechRuleState.SGDSActive); // Activate the loaded grammar
            }
            Application.Run(); // Starts a standard application message loop on the current thread
        }
        catch // Handle exceptions in above code
        {
            VA.SetText("~~RecognitionError", "Error during voice recognition setup (SAPI)"); // Send error detail back to VoiceAttack as text variable
            return; // Stop executing the code
        }
        finally // Runs whether an exception is encountered or not
        {
            MyRecognizer = null; // Set to null in preparation for garbage collection
            FileStream.Close(); // Close the input FileStream
            FileStream = null; // Set to null in preparation for garbage collection

            // Close up recognition event handling
            RecoContext.Recognition -= new _ISpeechRecoContextEvents_RecognitionEventHandler(RecoContext_Recognition); // Unregister for successful voice recognition events
            RecoContext.FalseRecognition -= new _ISpeechRecoContextEvents_FalseRecognitionEventHandler(RecoContext_FalseRecognition); // Unregister for failed (low confidence) voice recognition events
            if (VA.GetBoolean("~~ShowRecognitionHypothesis") == true) // Check if user wanted to show voice recognition hypothesis results
                RecoContext.Hypothesis -= new _ISpeechRecoContextEvents_HypothesisEventHandler(RecoContext_Hypothesis); // Unregister for voice recognition hypothesis events
            RecoContext.EndStream -= new _ISpeechRecoContextEvents_EndStreamEventHandler(RecoContext_EndStream); // Unregister for end of file stream events
            RecoContext = null; // Set to null in preparation for garbage collection
        }
        //VA.WriteToLog("voice recognition complete"); // Output info to event log
    }

    // Function for converting text to a voiced wav file via text-to-speech
    public bool TextToWav(string FilePath, string text)
    {
        //VA.WriteToLog("creating wav file"); // Output info to event log
        SpFileStream stream = new SpFileStream(); // Create new SpFileStream instance
        try // Attempt the following code
        {
            if (System.IO.File.Exists(FilePath) == true) // Check if voice recognition wav file already exists
                System.IO.File.Delete(FilePath); // Delete existing voice recognition wav file
            stream.Format.Type = SpeechAudioFormatType.SAFT48kHz16BitStereo; // Set the file stream audio format
            stream.Open(FilePath, SpeechStreamFileMode.SSFMCreateForWrite, true); // Open the specified file for writing with events enabled
            SpVoice voice = new SpVoice(); // Create new SPVoice instance
            voice.Volume = 100; // Set the volume level of the text-to-speech voice
            voice.Rate = -2; // Set the rate at which text is spoken by the text-to-speech engine
            string NameAttribute = "Name = " + VA.GetText("~~TextToSpeechVoice");
            voice.Voice = voice.GetVoices(NameAttribute).Item(0);
            //voice.Speak(text);
            voice.AudioOutputStream = stream; // Send the audio output to the file stream
            voice.Speak(text, SpeechVoiceSpeakFlags.SVSFDefault); // Internally "speak" the inputted text (which records it in the wav file)
            voice = null; // Set to null in preparation for garbage collection
        }
        catch // Handle exceptions in above code
        {
            VA.SetText("~~RecognitionError", "Error during wav file creation (SAPI)"); // Send error detail back to VoiceAttack as text variable
            return false; // Send "false" back to calling code line
        }
        finally // Runs whether an exception is encountered or not
        {
            stream.Close(); // Close the file stream
            stream = null; // Set to null in preparation for garbage collection
        }
        return true; // Send "true" back to calling code line
    }

    // Event handler for successful (higher confidence) voice recognition
    public void RecoContext_Recognition(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
    {
        //VA.WriteToLog("Recognition successful"); // Output info to event log

        //VA.SetText("~~FalseRecognitionFlag", ""); // Send blank recognition flag ("") back to VoiceAttack as text variable
        //RecognitionFlag = ""; // Set the RecognitionFlag as blank
        RecognitionProcessing(Result); // Process the voice recognition result
        //if (UseDictation == false) // Check if pronunciation dictation grammar should NOT be used with speech recognition
        GetPhonemes(Result); // Retrieve SAPI phonemes from recognition result
    }

    // Event handler for unsuccessful (low confidence) voice recognition
    public void RecoContext_FalseRecognition(int StreamNumber, object StreamPosition, ISpeechRecoResult Result)
    {
        //VA.WriteToLog("Low confidence recognition"); // Output info to event log

        //VA.WriteToLog(Result.PhraseInfo.GetText());
        //VA.SetText("~~FalseRecognitionFlag", "*"); // Send unsuccessful recognition flag (text character) back to VoiceAttack as text variable
        RecognitionFlag = "*"; // Set the RecognitionFlag as "*"
        RecognitionProcessing(Result); // Process the voice recognition result
        GetPhonemes(Result); // Retrieve SAPI phonemes from recognition result
    }

    // Event handler for voice recognition hypotheses
    public void RecoContext_Hypothesis(int StreamNumber, object StreamPosition, ISpeechRecoResult Result)
    {
        //VA.WriteToLog("Recognition hypothesis"); // Output info to event log

        float confidence = Result.PhraseInfo.Elements.Item(0).EngineConfidence;
        VA.WriteToLog("Hypothesis = " + Result.PhraseInfo.GetText() + " (" + Decimal.Round(Convert.ToDecimal(confidence), (confidence > 0.01 ? 3 : 4)) + ")"); // Output info to event log
    }

    // Event handler for reaching the end of an audio input stream
    public void RecoContext_EndStream(int StreamNumber, object StreamPosition, bool StreamReleased)
    {
        // VA.WriteToLog("End of stream, cleaning up now"); // Output info to event log

        // Clean up now that voice recognition is complete
        try // Attempt the following code
        {
            if (UseDictation == true)
                grammar.DictationSetState(SpeechRuleState.SGDSInactive); // Deactivate dictation grammar
            else
                grammar.CmdSetRuleIdState(0, SpeechRuleState.SGDSInactive); // Deactivate the loaded grammar
        }
        catch // Handle exceptions in above code
        {
            VA.SetText("~~RecognitionError", "Error during cleanup process (SAPI)"); // Send error detail back to VoiceAttack as text variable
        }
        finally // Runs whether an exception is encountered or not
        {
            Application.ExitThread(); // Terminates the message loop on the current thread
        }
    }

    // Function for processing voice recognition results
    public void RecognitionProcessing(ISpeechRecoResult Result)
    {
        //VA.WriteToLog("Processing recognition result"); // Output info to event log

        try // Attempt the following code
        {
            string RecognizedText = Result.PhraseInfo.GetText().Trim(); // Store recognized text    
            float confidence = Result.PhraseInfo.Elements.Item(0).EngineConfidence; // Get confidence of voice recognition result
            decimal RecognitionConfidenceScore = Decimal.Round(Convert.ToDecimal(confidence), (confidence > 0.01 ? 3 : 4)); // Convert the recognition confidence to decimal and round the result
            string RecognitionConfidenceLevel = Result.PhraseInfo.Elements.Item(0).ActualConfidence.ToString().Replace("SEC", "").Replace("Confidence", "");
            VA.SetText("~~RecognizedText", RecognizedText); // Send recognized text back to VoiceAttack as text variable
            //VA.SetText("~~RecognitionConfidenceLevel", RecognitionConfidenceLevel); // Send speech recognition confidence level back to VoiceAttack as text variable
            //VA.SetDecimal("~~RecognitionConfidence", RecognitionConfidenceScore); // Send recognized confidence back to VoiceAttack as decimal variable

            if (VA.GetBoolean("~~ShowConfidence") == true)
                RecognitionConfidence = "(" + RecognitionConfidenceLevel + " @ " + RecognitionConfidenceScore.ToString() + ")" + RecognitionFlag;
            //VA.SetText("~~RecognitionConfidence", RecognitionConfidenceLevel + " @ " + RecognitionConfidenceScore.ToString()); // Send speech recognition confidence data back to VoiceAttack as text variable
            VA.SetText("~~RecognitionConfidence", RecognitionConfidence); // Send formatted speech recognition confidence data back to VoiceAttack as text variable
            if (UseDictation == true) // Check if pronunciation dictation grammar should be used with speech recognition
            {
                RecognizedText = RecognizedText.Replace("hh", "h"); // Replace any instances of "hh" in recognized phonemes with "h"
                VA.SetText("~~SAPIPhonemes", RecognizedText); // Send word-delimited SAPI phoneme data back to VoiceAttack as text variable
            }
        }
        catch (Exception e) // Handle exceptions in above code
        {
            VA.WriteToLog(e.ToString());
            VA.SetText("~~RecognitionError", "Error during processing of recognition result (SAPI)"); // Send error detail back to VoiceAttack as text variable
        }
    }

    // Function for extracting SAPI phonemes from voice recognition results
    public void GetPhonemes(ISpeechRecoResult Result)
    {
        //VA.WriteToLog("Extracting phonemes from voice recognition result"); // Output info to event log

        try // Attempt the following code
        {
            SpPhoneConverter MyPhoneConverter = new SpPhoneConverter(); // Create new SPPhoneConverter instance
            MyPhoneConverter.LanguageId = 1033; // Set the phone converter's language (English = 1033)
            string SAPIPhonemesRaw = null; // Initialize string for storing raw SAPI phoneme data
            string SAPIPhonemes = null; // Initialize string for storing delimited SAPI phoneme data
            int i = 1; // Initialize integer for tracking phoneme count
            string WordSeparator = " "; // Initialize string variable for storing the characters used to separate words within the phoneme result

            if (VA.GetBoolean("~~SeparatePhonemes") == true) // Check if user wants to have the "-" character separate the words within the phoneme result
                WordSeparator = " - "; // Redefine the WordSeparator            
            foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements) // Loop through each element of the recognized text
            {
                if (MyPhrase.DisplayText != " ")
                {
                    SAPIPhonemesRaw += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation); // Build string of SAPI phonemes extracted from the recognized text
                    SAPIPhonemes += (i++ > 1 ? WordSeparator : " ") + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation); // Build string of SAPI phonemes extracted from the recognized text, delimited by WordSeparator
                }
            }
            MyPhoneConverter = null; // Set to null in preparation for garbage collection

            VA.SetText("~~SAPIPhonemesRaw", SAPIPhonemesRaw.Trim()); // Send raw SAPI phoneme data back to VoiceAttack as text variable
            VA.SetText("~~SAPIPhonemes", SAPIPhonemes.Trim()); // Send word-delimited SAPI phoneme data back to VoiceAttack as text variable
        }
        catch // Handle exceptions in above code
        {
            VA.SetText("~~RecognitionError", "Error during phoneme extraction"); // Send error detail back to VoiceAttack as text variable
        }
    }
}

// References:
// https://github.com/rti7743/rtilabs/blob/master/files/asobiba/DictationFilter/DictationFilter/SpeechRecognitionRegexp.cs
// https://stackoverflow.com/questions/6193874/help-with-sapi-v5-1-speechrecognitionengine-always-gives-same-wrong-result-with/6203533#6203533
// http://www.drdobbs.com/com-objects-c-and-the-microsoft-speech-a/184416575
// http://vbcity.com/forums/t/125150.aspx
// https://people.kth.se/~maguire/DEGREE-PROJECT-REPORTS/050702-Johan_Sverin-with-cover.pdf
// https://msdn.microsoft.com/en-us/library/ee125471(v=vs.85).aspx
// https://stackoverflow.com/questions/20770593/speech-to-phoneme-in-net
Exergist