I'm very close to completing a personal project for modifying the Windows speech dictionary via C# (SAPI 5.4). The last bit I'm working on is how to get the SAPI phone set for a given word. I've found a way to do this via a C# form and spoken recognition obtained through SpSharedRecoContext. However I'm trying to get the recognition to work with a voice file (*.wav) as the input. I understand that this needs to be done via an SpInprocRecoContext.
Every example from Microsoft I've found regarding SAPI 5.4 recognition (like this one for VB) is for SpSharedRecoContext and not SpInprocRecoContext (and I believe I've seen comments that some of these examples are missing details). Additionally, I've found multiple topics on this forum (mostly answered by Eric Brown, see topic 1, topic 2, topic 3) that mention using a SpInprocRecoContext requires more setup than SpSharedRecoContext, but I have yet to find a definitive answer for how to capture voice recognition events when using SpInprocRecoContext in C#.
How can I proceed on this?
Here is my code so far (edited for better organization):
using SpeechLib;
using System;
namespace SpeechTest
{
class Program
{
static void Main(string[] args)
{
string MyText = "dolphins"; // Text string of interest
string WaveFile = @"C:\Reco\MYWAVE.wav"; // Path to wav file used for voice recognition
// Declare important recognition objects
SpInprocRecognizer Recognizer;
SpInProcRecoContext RecoContext;
ISpeechRecoGrammar grammar;
ISpeechFileStream MyFileStream;
ISpeechVoice MyVoice;
// Create recognizer and recognition context
RecoContext = new SpInProcRecoContext();
Recognizer = (SpInprocRecognizer)RecoContext.Recognizer;
// Set up recognition event handling
RecoContext.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(RecoContext_Recognition);
// Set up the grammar
grammar = RecoContext.CreateGrammar(); // Initialize the grammar
grammar.DictationLoad("", SpeechLoadOption.SLOStatic); // Set up dictation grammar
grammar.DictationSetState(SpeechRuleState.SGDSActive); // Activate the grammar
// Set up audio input for SpInProcRecoContext
SpObjectTokenCategory Category = new SpObjectTokenCategory();
Category.SetId(SpeechStringConstants.SpeechCategoryAudioIn);
SpObjectToken AudioToken = new SpObjectToken();
AudioToken.SetId(Category.Default);
Recognizer.AudioInput = AudioToken;
//Category.SetId(SpeechStringConstants.SpeechCategoryRecognizers); // <== generates a runtime error!!!
//SpObjectToken EngineToken = new SpObjectToken();
//EngineToken.SetId(Category.Default);
//Recognizer.Recognizer = EngineToken;
//Category.SetId(SpeechStringConstants.SpeechCategoryRecoProfiles); // <== generates a runtime error!!!
//SpObjectToken ProfileToken = new SpObjectToken();
//ProfileToken.SetId(Category.Default);
//Recognizer.Profile = ProfileToken;
// Create an audio file stream from MyText
MyFileStream = new SpFileStream(); // Create new SpFileStream instance
TextToWave(MyText, WaveFile); // Call function to create a wav file that voices MyText
MyFileStream.Open(WaveFile, SpeechStreamFileMode.SSFMOpenForRead, true);
// Activate the recognizer and input the audio file stream into the recognizer
RecoContext.State = SpeechRecoContextState.SRCS_Enabled;
Recognizer.State = SpeechRecognizerState.SRSActive;
Recognizer.AudioInputStream = MyFileStream; // <== generates a runtime error!!!
// Output info and cleanup
Console.WriteLine(MyText + " = " + SAPIPhonemes);
MyFileStream.Close();
Console.ReadLine();
}
static void TextToWave(string text, string file)
{
SpFileStream fileStream = new SpFileStream();
SpVoice voice = new SpVoice();
fileStream.Open(file, SpeechStreamFileMode.SSFMCreateForWrite, true);
voice.AudioOutputStream = fileStream;
voice.Speak(text);
fileStream.Close();
}
public static string SAPIPhonemes = null;
public static void RecoContext_Recognition(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
{
// This event is recognized and all the below code works fine when using SpSharedRecoContext
Console.WriteLine(Result.ToString());
string SAPIPhonemes = null;
SpPhoneConverter MyPhoneConverter = new SpPhoneConverter();
MyPhoneConverter.LanguageId = 1033;
foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements)
{
SAPIPhonemes += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation);
}
}
}
}
For reference here is the form-based SpSharedRecoContext code that works:
using SpeechLib;
using System;
using System.Windows.Forms;
namespace RecoForm
{
public partial class Form1 : Form
{
// Speech Recognition Object
SpSharedRecoContext listener;
// Grammar object
ISpeechRecoGrammar grammar;
public Form1()
{
InitializeComponent();
}
private void Form1_Load(object sender, EventArgs e)
{
// nothing
}
public string ps;
private void button1_Click(object sender, EventArgs e)
{
if (btnListen.Text == "Start Listening")
{
// textBox1.Clear();
try
{
listener = new SpSharedRecoContext();
listener.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(listener_Reco);
grammar = listener.CreateGrammar(0);
grammar.DictationLoad("", SpeechLoadOption.SLOStatic);
grammar.DictationSetState(SpeechRuleState.SGDSActive);
btnListen.Text = "Stop Listening";
if (ps == "1")
{
listener.Resume();
ps = "0";
}
}
catch (Exception ex)
{
MessageBox.Show(ex.Message);
}
}
else if (btnListen.Text == "Stop Listening")
{
listener.Pause();
btnListen.Text = "Start Listening";
if (ps == "0")
{
ps = "1";
}
}
}
public void listener_Reco(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
{
string heard = Result.PhraseInfo.GetText(0, -1, true);
textBox1.Text += " " + heard;
SpPhoneConverter MyPhoneConverter = new SpPhoneConverter();
MyPhoneConverter.LanguageId = 1033;
foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements)
textBox2.Text += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation);
}
}
}
// https://stackoverflow.com/questions/11935533/c-sharp-sapi-5-4-languages
Here is another example (in VB) that combines the Microsoft examples (here and here) which STILL doesn't work (see the comments in Command1_Click to find the location where I encounter a runtime error).
Imports SpeechLib
Public Class Form1
Const WaveFile = "C:\Reco\MYWAVE.wav"
Dim WithEvents RC As SpInProcRecoContext
Dim Recognizer As SpInprocRecognizer
Dim myGrammar As ISpeechRecoGrammar
Dim MyFileStream As SpeechLib.SpFileStream
Dim MyVoice As SpeechLib.SpVoice
Dim MyText As String
Private Sub Form1_Load(sender As Object, e As EventArgs) Handles MyBase.Load
On Error GoTo EH
RC = New SpInProcRecoContext
Recognizer = RC.Recognizer
myGrammar = RC.CreateGrammar
myGrammar.DictationSetState(SpeechRuleState.SGDSActive)
MyVoice = New SpVoice
MyVoice.Voice = MyVoice.GetVoices("gender=male").Item(0)
Dim Category As SpObjectTokenCategory
Category = New SpObjectTokenCategory
Category.SetId(SpeechStringConstants.SpeechCategoryAudioIn)
Dim Token As SpObjectToken
Token = New SpObjectToken
Token.SetId(Category.Default)
Recognizer.AudioInput = Token
TextBox1.Text = "play the eight of clubs"
EH:
If Err.Number Then ShowErrMsg()
End Sub
Private Sub Command1_Click(sender As Object, e As EventArgs) Handles Command1.Click
MyFileStream = MakeWAVFileFromText(TextBox1.Text, WaveFile)
MyFileStream.Open(WaveFile)
Recognizer.AudioInputStream = MyFileStream ' ==> produces a runtime error!!!
End Sub
Private Sub RC_Recognition(ByVal StreamNumber As Long, ByVal StreamPosition As Object, ByVal RecognitionType As SpeechLib.SpeechRecognitionType, ByVal Result As SpeechLib.ISpeechRecoResult)
On Error GoTo EH
TextBox2.Text = Result.PhraseInfo.GetText
EH:
If Err.Number Then ShowErrMsg()
End Sub
Private Sub ShowErrMsg()
' Declare identifiers:
Const NL = vbNewLine
Dim T As String
T = "Desc: " & Err.Description & NL
T = T & "Err #: " & Err.Number
MsgBox(T, vbExclamation, "Run-Time Error")
End
End Sub
Private Function MakeWAVFileFromText(ByVal strText As String, ByVal strFName As String) As SpFileStream
On Error GoTo EH
' Declare identifiers:
Dim FileStream As SpFileStream
Dim Voice As SpVoice
' Instantiate Voice and FileStream objects:
Voice = New SpVoice
FileStream = New SpFileStream
' Open specified .wav file, set voice output
' to file, and speak synchronously:
FileStream.Open(strFName, SpeechStreamFileMode.SSFMCreateForWrite, True)
Voice.AudioOutputStream = FileStream
Voice.Speak(strText, SpeechVoiceSpeakFlags.SVSFIsXML)
' Close file and return reference to FileStream object:
FileStream.Close()
MakeWAVFileFromText = FileStream
EH:
If Err.Number Then ShowErrMsg()
End Function
End Class
' https://msdn.microsoft.com/en-us/library/ee125184%28v=vs.85%29.aspx
' https://msdn.microsoft.com/en-us/library/ee125344(v=vs.85).aspx
UPDATE: so THIS works however the end of stream event does not fire, keeping Application.Run from returning. I can use some StopWatch jiggery to close up everything as a workaround but obviously that's not ideal. Please bear in mind that I'm still very novice with C# so my comments may not be 100% accurate.
Any ideas how to get the end stream event to fire?
using SpeechLib;
using System;
using System.Windows.Forms;
namespace SAPITextFromVoice
{
class Program
{
// Initialize variables needed throughout this code
static ISpeechRecoGrammar grammar; // Declare the grammar
static SpFileStream FileStream; // Declare the voice recognition input file stream
static string AudioPath = null; // Declare directory path to wav file
static string GrammarPath = null; // Declare directory path to grammar file
static void Main(string[] args)
{
// Initialize string variable for storing the text of interest
string MyText = "the rain in spain";
// Store path to speech grammar XML file
//GrammarPath = @"C:\Reco\MyGrammar.xml";
// Store path to voice recognition input wav file
AudioPath = @"C:\Reco\MyAudio.wav";
TextToWav(AudioPath, MyText);
try // Attempt the following code
{
// Open the created wav in a new FileStream
FileStream = new SpFileStream(); // Create new instance of SpFileStream
FileStream.Open(AudioPath, SpeechStreamFileMode.SSFMOpenForRead, true); // Open the specified file in the FileStream for reading with events enabled
// Create speech recognizer and associated context
SpInprocRecognizer MyRecognizer = new SpInprocRecognizer(); // Create new instance of SpInprocRecognizer
SpInProcRecoContext RecoContext = (SpInProcRecoContext)MyRecognizer.CreateRecoContext(); // Initialize the SpInProcRecoContext (in-process recognition context)
// Set the voice recognition input as the FileStream
MyRecognizer.AudioInputStream = FileStream; // This will internally "speak" the wav file for input into the voice recognition engine
// Set up recognition event handling
RecoContext.Recognition += new _ISpeechRecoContextEvents_RecognitionEventHandler(RecoContext_Recognition); // Register for successful voice recognition events
RecoContext.FalseRecognition += new _ISpeechRecoContextEvents_FalseRecognitionEventHandler(RecoContext_FalseRecognition); // Register for failed (low confidence) voice recognition events
RecoContext.Hypothesis += new _ISpeechRecoContextEvents_HypothesisEventHandler(RecoContext_Hypothesis); // Register for voice recognition hypothesis events
RecoContext.EndStream += new _ISpeechRecoContextEvents_EndStreamEventHandler(RecoContext_EndStream); // Register for end of file stream events
// Set up the grammar
grammar = RecoContext.CreateGrammar(); // Initialize the grammar object
//grammar.CmdLoadFromFile(GrammarPath, SpeechLoadOption.SLODynamic); // Load custom XML grammar file
//grammar.CmdSetRuleIdState(0, SpeechRuleState.SGDSActive); // Activate the loaded grammar
grammar.DictationLoad("", SpeechLoadOption.SLOStatic); // Load blank dictation topic into the grammar
grammar.DictationSetState(SpeechRuleState.SGDSActive); // Activate dictation grammar
}
catch // Handle exceptions in above code
{
Console.WriteLine("Error during voice recognition setup");
return; // Stop executing the code
}
Application.Run(); // Starts a standard application message loop on the current thread
Console.WriteLine("done");
Console.ReadLine();
}
// Function for converting text to a voiced wav file via text-to-speech
public static bool TextToWav(string FilePath, string text)
{
try // Attempt the following code
{
if (System.IO.File.Exists(FilePath) == true) // Check if voice recognition wav file already exists
System.IO.File.Delete(FilePath); // Delete existing voice recognitoin wav file
SpFileStream stream = new SpFileStream(); // Create new SpFileStream instance
stream.Format.Type = SpeechAudioFormatType.SAFT48kHz16BitStereo; // Set the file stream audio format
stream.Open(FilePath, SpeechStreamFileMode.SSFMCreateForWrite, true); // Open the specified file for writing with events enabled
SpVoice voice = new SpVoice(); // Create new SPVoice instance
voice.Volume = 100; // Set the volume level of the text-to-speech voice
voice.Rate = -2; // Set the rate at which text is spoken by the text-to-speech engine
string NameAttribute = "Name = " + "Microsoft Anna";
voice.Voice = voice.GetVoices(NameAttribute).Item(0);
//voice.Speak(text);
voice.AudioOutputStream = stream; // Send the audio output to the file stream
voice.Speak(text, SpeechVoiceSpeakFlags.SVSFDefault); // Internally "speak" the inputted text (which records it in the wav file)
stream.Close(); // Close the file stream
return true; // Send "true" back to calling code line
}
catch // Handle exceptions in above code
{
Console.WriteLine("Error during wav file creation");
return false; // Send "false" back to calling code line
}
}
// Event handler for successful (higher confidence) voice recognition
public static void RecoContext_Recognition(int StreamNumber, object StreamPosition, SpeechRecognitionType RecognitionType, ISpeechRecoResult Result)
{
RecognitionProcessing(Result, true); // Process the voice recognition result
}
// Event handler for false (low confidence) voice recognition
public static void RecoContext_FalseRecognition(int StreamNumber, object StreamPosition, ISpeechRecoResult Result)
{
RecognitionProcessing(Result, false); // Process the voice recognition result
}
// Event handler for voice recognition hypotheses
public static void RecoContext_Hypothesis(int StreamNumber, object StreamPosition, ISpeechRecoResult Result)
{
float confidence = Result.PhraseInfo.Elements.Item(0).EngineConfidence;
Console.WriteLine(("Hypothesis = " + Result.PhraseInfo.GetText() + " (" + Decimal.Round(Convert.ToDecimal(confidence), (confidence > 0.01 ? 3 : 4)) + ")")); // Output info to console
}
// Event handler for reaching the end of an audio input stream
public static void RecoContext_EndStream(int StreamNumber, object StreamPosition, bool StreamReleased)
{
// Clean up now that voice recognition is complete
Console.WriteLine("--- END OF STREAM ---"); // Output info to the console
try // Attempt the following code
{
//grammar.CmdSetRuleIdState(0, SpeechRuleState.SGDSInactive); // Deactivate the loaded grammar
grammar.DictationSetState(SpeechRuleState.SGDSInactive); // Deactivate dictation grammar
FileStream.Close(); // Close the input FileStream
Application.ExitThread(); // Terminates the message loop on the current thread
}
catch // Handle exceptions in above code
{
Console.WriteLine("Error during cleanup process");
}
}
// Function for processing voice recognition results
public static void RecognitionProcessing(ISpeechRecoResult Result, bool RecoType)
{
try // Attempt the following code
{
string RecognizedText = Result.PhraseInfo.GetText().Trim(); // Store recognized text
float confidence = Result.PhraseInfo.Elements.Item(0).EngineConfidence; // Get confidence of voice recognition result
decimal RecognitionConfidence = Decimal.Round(Convert.ToDecimal(confidence), (confidence > 0.01 ? 3 : 4)); // Calculate confidence of voice recognition result convert to decimal, and round the result
Console.WriteLine((RecoType == false ? "false " : "") + "recognition = " + RecognizedText + " (" + RecognitionConfidence + ")"); // Output info to the console
GetPhonemes(Result); // Retrieve SAPI phonemes from recognized words
}
catch // Handle exceptions in above code
{
Console.WriteLine("Error during processing of recognition result");
}
}
// Function for extracting SAPI phonemes from voice recognition results
public static void GetPhonemes(ISpeechRecoResult Result)
{
try // Attempt the following code
{
SpPhoneConverter MyPhoneConverter = new SpPhoneConverter(); // Create new SPPhoneConverter instance
MyPhoneConverter.LanguageId = 1033; // Set the phone converter's language (English = 1033)
string SAPIPhonemesRaw = null; // Initialize string for storing raw SAPI phoneme data
string SAPIPhonemes = null; // Initialize string for storing delimited SAPI phoneme data
int i = 1; // Initialize integer for tracking phoneme count
foreach (ISpeechPhraseElement MyPhrase in Result.PhraseInfo.Elements) // Loop through each element of the recognized text
{
SAPIPhonemesRaw += " " + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation); // Build string of SAPI phonemes extracted from the recognized text
SAPIPhonemes += (i++ > 1 ? " - " : " ") + MyPhoneConverter.IdToPhone(MyPhrase.Pronunciation); // Build string of SAPI phonemes extracted from the recognized text, delimited by "-"
}
Console.WriteLine("Phonemes = " + SAPIPhonemes.Trim());
}
catch // Handle exceptions in above code
{
Console.WriteLine("Error during phoneme extraction");
}
}
}
}