Speech recognition with the Kinect SDK and the Speech Platform involves a surprising number of classes, which makes it a little confusing.
So I pulled just the necessary parts out into a standalone class.
The complete projects are available here; the base is the "Speech" sample from the Kinect SDK.
- speech_sdk_sandbox/KinectSpeechRecognize at master · kaorun55/speech_sdk_sandbox · GitHub
- speech_sdk_sandbox/SpeechRecognizer at master · kaorun55/speech_sdk_sandbox · GitHub
The Recognizer class
This class bundles the "SpeechRecognitionEngine", "RecognizerInfo", "GrammarBuilder", and "Grammar" classes that speech recognition needs.
Pass the language and the word choices to the constructor, then set the audio stream, and you're done.
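As a minimal sketch of that call pattern (assuming the usual usings: System, System.IO, Microsoft.Speech.AudioFormat, Microsoft.Speech.Recognition; the stream and file name here are placeholders standing in for the Kinect audio source that the complete sample further below sets up):

```csharp
// Placeholder input: any 16 kHz, 16-bit mono PCM stream would do here;
// the full sample below uses KinectAudioSource.Start() instead.
Stream audioStream = File.OpenRead( "speech.pcm" );
var format = new SpeechAudioFormatInfo( EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null );

var words = new Choices();
words.Add( "red" );
words.Add( "blue" );

var recognizer = new Recognizer( "en-US", words );
recognizer.SpeechRecognized += ( s, e ) => Console.WriteLine( e.Result.Text );
recognizer.SetInputToAudioStream( audioStream, format );
recognizer.RecognizeAsync( RecognizeMode.Multiple );
Console.ReadLine();  // keep the process alive while recognition runs
```

And here is the class itself: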
```csharp
using System;
using System.IO;
using System.Linq;
using Microsoft.Speech.AudioFormat;
using Microsoft.Speech.Recognition;

namespace SpeechRecognizer
{
    public class Recognizer
    {
        SpeechRecognitionEngine engine;
        RecognizerInfo info;
        GrammarBuilder builder;
        Grammar grammar;

        // The engine's events are re-exposed so callers never touch SpeechRecognitionEngine directly
        public event EventHandler<SpeechRecognitionRejectedEventArgs> SpeechRecognitionRejected;
        public event EventHandler<SpeechHypothesizedEventArgs> SpeechHypothesized;
        public event EventHandler<SpeechRecognizedEventArgs> SpeechRecognized;

        public string Name
        {
            get { return info.Name; }
        }

        public Recognizer( string name, Choices alternateChoices )
        {
            // Find the installed recognizer for the requested culture (e.g. "ja-JP")
            info = GetRecognizer( name );
            if ( info == null ) {
                // Fail fast with a clear error when no recognizer for this culture is installed
                throw new InvalidOperationException( "No recognizer installed for culture: " + name );
            }

            engine = new SpeechRecognitionEngine( info.Id );

            // Build a grammar consisting only of the given word choices
            builder = new GrammarBuilder();
            builder.Culture = info.Culture;
            builder.Append( alternateChoices );

            grammar = new Grammar( builder );
            engine.LoadGrammar( grammar );

            engine.SpeechRecognized += Recognizer_SpeechRecognized;
            engine.SpeechHypothesized += Recognizer_SpeechHypothesized;
            engine.SpeechRecognitionRejected += Recognizer_SpeechRecognitionRejected;
        }

        public void SetInputToAudioStream( Stream audioSource, SpeechAudioFormatInfo audioFormat )
        {
            engine.SetInputToAudioStream( audioSource, audioFormat );
        }

        public void RecognizeAsync( RecognizeMode mode )
        {
            engine.RecognizeAsync( mode );
        }

        public void RecognizeAsyncStop()
        {
            engine.RecognizeAsyncStop();
        }

        private void Recognizer_SpeechRecognitionRejected( object sender, SpeechRecognitionRejectedEventArgs e )
        {
            if ( SpeechRecognitionRejected != null ) {
                SpeechRecognitionRejected( sender, e );
            }
        }

        private void Recognizer_SpeechHypothesized( object sender, SpeechHypothesizedEventArgs e )
        {
            if ( SpeechHypothesized != null ) {
                SpeechHypothesized( sender, e );
            }
        }

        private void Recognizer_SpeechRecognized( object sender, SpeechRecognizedEventArgs e )
        {
            if ( SpeechRecognized != null ) {
                SpeechRecognized( sender, e );
            }
        }

        private static RecognizerInfo GetRecognizer( string name )
        {
            // Match the culture name ("ja-JP", "en-US", ...) case-insensitively;
            // returns null when no matching recognizer is installed
            Func<RecognizerInfo, bool> matchingFunc = r =>
            {
                return name.Equals( r.Culture.Name, StringComparison.InvariantCultureIgnoreCase );
            };
            return SpeechRecognitionEngine.InstalledRecognizers().Where( matchingFunc ).FirstOrDefault();
        }
    }
}
```
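Note that GetRecognizer returns null when no recognizer matching the culture name is installed, which is why the constructor checks for it. If you are not sure which Speech Platform language packs are on your machine, a small sketch like this lists them (InstalledRecognizers is the same API the class uses internally):

```csharp
using System;
using Microsoft.Speech.Recognition;

class ListRecognizers
{
    static void Main()
    {
        // Prints every installed Speech Platform recognizer and its culture name
        foreach ( RecognizerInfo info in SpeechRecognitionEngine.InstalledRecognizers() )
        {
            Console.WriteLine( "{0} ({1})", info.Name, info.Culture.Name );
        }
    }
}
```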
Using it
This is basically the Speech sample as-is, but the speech recognition part is now simpler.
```csharp
using System;
using System.IO;
using Microsoft.Research.Kinect.Audio;
using Microsoft.Speech.AudioFormat;
using Microsoft.Speech.Recognition;
using SpeechRecognizer;

namespace KinectSpeechRecognize
{
    class Program
    {
        static void Main( string[] args )
        {
            try {
                using ( var source = new KinectAudioSource() ) {
                    // Configure the Kinect microphone array: beam-forming only, no automatic gain control
                    source.FeatureMode = true;
                    source.AutomaticGainControl = false;
                    source.SystemMode = SystemMode.OptibeamArrayOnly;

                    // The words the grammar will accept (English and Japanese mixed)
                    var colors = new Choices();
                    colors.Add( "red" );
                    colors.Add( "green" );
                    colors.Add( "blue" );
                    colors.Add( "end" );
                    colors.Add( "赤" );
                    colors.Add( "ミドリ" );
                    colors.Add( "あお" );

                    Recognizer r = new Recognizer( "ja-JP", colors );
                    r.SpeechRecognized += SreSpeechRecognized;
                    r.SpeechHypothesized += SreSpeechHypothesized;
                    r.SpeechRecognitionRejected += SreSpeechRecognitionRejected;

                    Console.WriteLine( "Using: {0}", r.Name );

                    using ( Stream s = source.Start() ) {
                        // The Kinect audio stream is 16 kHz, 16-bit, mono PCM
                        r.SetInputToAudioStream( s,
                            new SpeechAudioFormatInfo( EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null ) );

                        Console.WriteLine( "Recognizing. Say: 'red', 'green' or 'blue'. Press ENTER to stop" );
                        r.RecognizeAsync( RecognizeMode.Multiple );
                        Console.ReadLine();

                        Console.WriteLine( "Stopping recognizer ..." );
                        r.RecognizeAsyncStop();
                    }
                }
            }
            catch ( Exception ex ) {
                Console.WriteLine( ex.Message );
            }
        }

        static void SreSpeechRecognitionRejected( object sender, SpeechRecognitionRejectedEventArgs e )
        {
            Console.WriteLine( "\nSpeech Rejected" );
        }

        static void SreSpeechHypothesized( object sender, SpeechHypothesizedEventArgs e )
        {
            Console.Write( "\rSpeech Hypothesized: \t{0}", e.Result.Text );
        }

        static void SreSpeechRecognized( object sender, SpeechRecognizedEventArgs e )
        {
            Console.WriteLine( "\nSpeech Recognized: \t{0}", e.Result.Text );
        }
    }
}
```
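The handlers above print every result as-is. One optional refinement, not part of the original sample, is to drop low-confidence results using RecognitionResult.Confidence; the 0.7 threshold below is an arbitrary example value:

```csharp
static void SreSpeechRecognized( object sender, SpeechRecognizedEventArgs e )
{
    // Ignore results the engine itself is not confident about
    if ( e.Result.Confidence < 0.7f ) {
        return;
    }
    Console.WriteLine( "\nSpeech Recognized: \t{0} ({1:F2})", e.Result.Text, e.Result.Confidence );
}
```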
Wrapping up
Now that the speech recognition part is self-contained, I thought I might be able to use it with OpenNI as well, but it turns out things aren't quite that simple. :)