ブログ@kaorun55

HoloLensやKinectなどのDepthセンサーを中心に書いています。

Speech Recognizerをまとめてシンプルにする

Kinect SDKとSpeech Platformを使った音声認識は、意外とたくさんのクラスを使うため、少しややこしいです。
そのため、まとめて必要な部分だけを外出しにしてみました。


全体のプロジェクトはこちらにあります。ベースはKinect SDKの「Speech」サンプルです。

Recognizerクラス

Speech Recognizerで使う「SpeechRecognitionEngine」、「RecognizerInfo」、「GrammarBuilder」、「Grammar」をまとめました。
コンストラクタに、使用する言語(「ja-JP」などのカルチャ名)と選択肢を渡せば、あとは入力ストリームを設定するだけでできあがりです。

using System;
using System.IO;
using System.Linq;
using Microsoft.Speech.AudioFormat;
using Microsoft.Speech.Recognition;

namespace SpeechRecognizer
{
    /// <summary>
    /// Wraps a <see cref="SpeechRecognitionEngine"/> together with the
    /// <see cref="RecognizerInfo"/>, <see cref="GrammarBuilder"/> and
    /// <see cref="Grammar"/> it needs, so callers only supply a culture name,
    /// a set of choices and an audio stream.
    /// </summary>
    public class Recognizer : IDisposable
    {
        readonly SpeechRecognitionEngine engine;
        readonly RecognizerInfo info;
        readonly GrammarBuilder builder;
        readonly Grammar grammar;
        bool disposed;

        /// <summary>Forwarded from the wrapped engine's SpeechRecognitionRejected event.</summary>
        public event EventHandler<SpeechRecognitionRejectedEventArgs> SpeechRecognitionRejected;

        /// <summary>Forwarded from the wrapped engine's SpeechHypothesized event.</summary>
        public event EventHandler<SpeechHypothesizedEventArgs> SpeechHypothesized;

        /// <summary>Forwarded from the wrapped engine's SpeechRecognized event.</summary>
        public event EventHandler<SpeechRecognizedEventArgs> SpeechRecognized;

        /// <summary>
        /// Gets the name of the recognizer that was selected for the requested culture.
        /// </summary>
        public string Name
        {
            get
            {
                return info.Name;
            }
        }

        /// <summary>
        /// Creates an engine for the installed recognizer whose culture name matches
        /// <paramref name="name"/> and loads a grammar built from
        /// <paramref name="alternateChoices"/>.
        /// </summary>
        /// <param name="name">Culture name of the recognizer to use, e.g. "ja-JP" (case-insensitive).</param>
        /// <param name="alternateChoices">The phrases the grammar should accept.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="name"/> or <paramref name="alternateChoices"/> is null.
        /// </exception>
        /// <exception cref="ArgumentException">
        /// No installed recognizer matches <paramref name="name"/>.
        /// </exception>
        public Recognizer( string name, Choices alternateChoices )
        {
            if ( name == null ) {
                throw new ArgumentNullException( "name" );
            }
            if ( alternateChoices == null ) {
                throw new ArgumentNullException( "alternateChoices" );
            }

            info = GetRecognizer( name );
            if ( info == null ) {
                // Without this check the failure would surface as a NullReferenceException
                // on info.Id, which gives the caller no hint about what went wrong.
                throw new ArgumentException(
                    string.Format( "No installed speech recognizer matches culture '{0}'.", name ),
                    "name" );
            }

            engine = new SpeechRecognitionEngine( info.Id );
            try {
                builder = new GrammarBuilder();
                builder.Culture = info.Culture;
                builder.Append( alternateChoices );

                grammar = new Grammar( builder );
                engine.LoadGrammar( grammar );
                engine.SpeechRecognized += Recognizer_SpeechRecognized;
                engine.SpeechHypothesized += Recognizer_SpeechHypothesized;
                engine.SpeechRecognitionRejected += Recognizer_SpeechRecognitionRejected;
            }
            catch {
                // The caller never receives this instance, so nobody else can dispose the engine.
                engine.Dispose();
                throw;
            }
        }

        /// <summary>
        /// Sets the audio stream the engine reads from.
        /// </summary>
        /// <param name="audioSource">The stream supplying audio data.</param>
        /// <param name="audioFormat">The format of the data in <paramref name="audioSource"/>.</param>
        public void SetInputToAudioStream( Stream audioSource, SpeechAudioFormatInfo audioFormat )
        {
            engine.SetInputToAudioStream( audioSource, audioFormat );
        }

        /// <summary>
        /// Starts asynchronous recognition on the wrapped engine.
        /// </summary>
        /// <param name="mode">Whether to perform a single or multiple recognitions.</param>
        public void RecognizeAsync( RecognizeMode mode )
        {
            engine.RecognizeAsync( mode );
        }

        /// <summary>
        /// Asks the wrapped engine to stop asynchronous recognition.
        /// </summary>
        public void RecognizeAsyncStop()
        {
            engine.RecognizeAsyncStop();
        }

        /// <summary>
        /// Detaches the forwarding handlers and disposes the wrapped engine.
        /// Calling this more than once is harmless.
        /// </summary>
        public void Dispose()
        {
            if ( disposed ) {
                return;
            }
            disposed = true;

            engine.SpeechRecognized -= Recognizer_SpeechRecognized;
            engine.SpeechHypothesized -= Recognizer_SpeechHypothesized;
            engine.SpeechRecognitionRejected -= Recognizer_SpeechRecognitionRejected;
            engine.Dispose();
        }


        // Each forwarder copies the delegate to a local first so that a subscriber
        // detaching between the null check and the call cannot cause a NullReferenceException.

        private void Recognizer_SpeechRecognitionRejected( object sender, SpeechRecognitionRejectedEventArgs e )
        {
            var handler = SpeechRecognitionRejected;
            if ( handler != null ) {
                handler( sender, e );
            }
        }

        private void Recognizer_SpeechHypothesized( object sender, SpeechHypothesizedEventArgs e )
        {
            var handler = SpeechHypothesized;
            if ( handler != null ) {
                handler( sender, e );
            }
        }

        private void Recognizer_SpeechRecognized( object sender, SpeechRecognizedEventArgs e )
        {
            var handler = SpeechRecognized;
            if ( handler != null ) {
                handler( sender, e );
            }
        }


        /// <summary>
        /// Finds the first installed recognizer whose culture name equals
        /// <paramref name="name"/>, ignoring case.
        /// </summary>
        /// <param name="name">Culture name to look for; must not be null.</param>
        /// <returns>The matching recognizer, or null when none is installed.</returns>
        private static RecognizerInfo GetRecognizer( string name )
        {
            // Culture names are identifiers rather than linguistic text, so compare ordinally.
            return SpeechRecognitionEngine.InstalledRecognizers()
                .FirstOrDefault( r => name.Equals( r.Culture.Name, StringComparison.OrdinalIgnoreCase ) );
        }
    }
}

使う側

基本的にはSpeechサンプルのままですが、音声認識のあたりがよりシンプルになっています。

using System;
using System.IO;
using Microsoft.Research.Kinect.Audio;
using Microsoft.Speech.AudioFormat;
using Microsoft.Speech.Recognition;
using SpeechRecognizer;

namespace KinectSpeechRecognize
{
    /// <summary>
    /// Console sample: feeds Kinect microphone-array audio into a
    /// <see cref="Recognizer"/> and prints the recognition events until ENTER is pressed.
    /// </summary>
    class Program
    {
        // Phrases offered to the recognizer, in the order they are added to the grammar.
        static readonly string[] Phrases =
        {
            "red", "green", "blue", "end", "赤", "ミドリ", "あお"
        };

        /// <summary>
        /// Entry point. Any exception is reported by printing its message to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main( string[] args )
        {
            try {
                using ( var kinectAudio = new KinectAudioSource() ) {
                    kinectAudio.FeatureMode = true;
                    kinectAudio.AutomaticGainControl = false;
                    kinectAudio.SystemMode = SystemMode.OptibeamArrayOnly;

                    var recognizer = new Recognizer( "ja-JP", BuildChoices() );
                    recognizer.SpeechRecognized += SreSpeechRecognized;
                    recognizer.SpeechHypothesized += SreSpeechHypothesized;
                    recognizer.SpeechRecognitionRejected += SreSpeechRecognitionRejected;
                    Console.WriteLine( "Using: {0}", recognizer.Name );

                    using ( Stream audioStream = kinectAudio.Start() ) {
                        recognizer.SetInputToAudioStream( audioStream, CreateAudioFormat() );

                        Console.WriteLine( "Recognizing. Say: 'red', 'green' or 'blue'. Press ENTER to stop" );

                        recognizer.RecognizeAsync( RecognizeMode.Multiple );

                        // Block until the user presses ENTER, then shut recognition down.
                        Console.ReadLine();
                        Console.WriteLine( "Stopping recognizer ..." );
                        recognizer.RecognizeAsyncStop();
                    }
                }
            }
            catch ( Exception ex ) {
                Console.WriteLine( ex.Message );
            }
        }

        /// <summary>
        /// Builds the <see cref="Choices"/> containing every entry of <see cref="Phrases"/>.
        /// </summary>
        static Choices BuildChoices()
        {
            var choices = new Choices();
            foreach ( string phrase in Phrases ) {
                choices.Add( phrase );
            }
            return choices;
        }

        /// <summary>
        /// Describes the audio handed to the recognizer: PCM, 16000 samples per second,
        /// 16 bits per sample, 1 channel, 32000 average bytes per second, block alignment 2.
        /// </summary>
        static SpeechAudioFormatInfo CreateAudioFormat()
        {
            return new SpeechAudioFormatInfo( EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null );
        }

        /// <summary>Prints a notice when speech was rejected.</summary>
        static void SreSpeechRecognitionRejected( object sender, SpeechRecognitionRejectedEventArgs e )
        {
            Console.WriteLine( "\nSpeech Rejected" );
        }

        /// <summary>Overwrites the current console line with the latest hypothesis text.</summary>
        static void SreSpeechHypothesized( object sender, SpeechHypothesizedEventArgs e )
        {
            string hypothesis = e.Result.Text;
            Console.Write( "\rSpeech Hypothesized: \t{0}", hypothesis );
        }

        /// <summary>Prints the recognized text on a new line.</summary>
        static void SreSpeechRecognized( object sender, SpeechRecognizedEventArgs e )
        {
            string recognized = e.Result.Text;
            Console.WriteLine( "\nSpeech Recognized: \t{0}", recognized );
        }
    }
}

まとめ

これで音声認識のところを独立できたので、OpenNIでも使えるかなぁと思ったのですが、事態はそう簡単にはいかないようですw