Kinect for Windows SDK beta で遊んでみた〜マイクの音声入力をスピーカーから出す〜 #shibuya_ni

やっとKinectのマイクにたどり着きました。。。サンプルを見てると、COM使いまくりでややこしくてなんとなく敬遠してました。
今回やったことは、Kinectのマイク入力をスピーカーからストリーミング出力することです。
スピーカー出力はやったことがなくていろいろ調べてて結局DirectXのXAudio2に落ち着きました。

環境

Windows 7 64bit
Visual Studio 2010 Premium
Microsoft DirectX SDK (June 2010)
- Developing games - Windows app development
ソース(プロジェクト一式)
- https://github.com/kaorun55/kinect_sdk_sandbox/tree/master/kinect_sdk_samples/AudioCaptureRaw/CPP
- MS SDKのAudioCaptureRawサンプルをベースにしています。変更点はこちら
  - https://github.com/kaorun55/kinect_sdk_sandbox/commit/377e81286a050b8b28ee646a25e51b515eb95619#L3L0

参考

AudioCaptureRawサンプルに、こちらのストリーミング再生を入れました。ありがとうございます:-)

XAudio2を使ってみる。その４-ストリーミング再生- - while( c++ );

実行にあたって

Microsoft DirectX SDK (June 2010)をインストールする
追加のインクルードディレクトリに $(DXSDK_DIR)\Include を追加する
追加のライブラリディレクトリに $(DXSDK_DIR)\Lib\x86 を追加する
ハウリングするので、スピーカーの音量を下げておく
実行時のオーディオソースの選択では「Kinect USB Audio」を選択する(この例では「0」)

Audioについて調べたこと

DirectSound：最新のDirectXで提供されておらず断念
OpenAL：OSSのマルチプラットフォーム音声ライブラリ。Kinectの入力WAVEフォーマットがWAVE_FORMAT_EXTENSIBLEであるが、これを入力にする方法がわからず断念
XAudio2：今のDirectXではこれを使うらしい(XboxとWindowsとのクロスプラットフォーム)。Kinectのサンプルから出力したWAVEファイルを再生できたので採用

ソース

CWASAPICapture::DoCaptureThread のみの変更でいけました。とりあえず載せときます

//
//  Capture thread body.
//
//  Initializes COM and XAudio2 for this thread, then polls the capture client
//  every half engine-latency period until _ShutdownEvent is signaled.  Each
//  captured packet is appended to _CaptureBuffer, and non-silent packets are
//  also queued on an XAudio2 source voice so they are played back.
//
//  Returns the failing HRESULT if COM cannot be initialized, otherwise 0.
//
DWORD CWASAPICapture::DoCaptureThread()
{
    HANDLE mmcssHandle = NULL;

    IXAudio2* xaudio = 0;
    IXAudio2MasteringVoice* mastering_voice = 0;
    IXAudio2SourceVoice* source_voice = 0;

    try {

        bool stillPlaying = true;
        DWORD mmcssTaskIndex = 0;

        HRESULT hr = CoInitializeEx(NULL, COINIT_MULTITHREADED);
        if (FAILED(hr))
        {
            // Nothing has been acquired yet, so returning directly is safe.
            printf_s("Unable to initialize COM in render thread: %x\n", hr);
            return hr;
        }

        // MMCSS registration is best effort: a failure is reported but not fatal.
        mmcssHandle = AvSetMmThreadCharacteristics(L"Audio", &mmcssTaskIndex);
        if (mmcssHandle == NULL)
        {
            printf_s("Unable to enable MMCSS on capture thread: %d\n", GetLastError());
        }

        //
        //  Initialize XAudio2.
        //
        {
            UINT32 flags = 0;
#ifdef _DEBUG
            flags |= XAUDIO2_DEBUG_ENGINE;
#endif
            if( FAILED( hr = XAudio2Create( &xaudio, flags ) ) )
                throw "XAudio2Create";

            //  Create a mastering voice
            if( FAILED( hr = xaudio->CreateMasteringVoice( &mastering_voice ) ) )
                throw "CreateMasteringVoice";

            //  Create the source voice using the format returned by MixFormat()
            if( FAILED( hr = xaudio->CreateSourceVoice( &source_voice, MixFormat() ) ) )
                throw "CreateSourceVoice";

            //  Start playback; buffers are queued as they are captured.
            if( FAILED( hr = source_voice->Start() ) )
                throw "Start";
        }

        while (stillPlaying)
        {
            //
            //  In Timer Driven mode, we want to wait for half the desired latency in milliseconds.
            //
            //  That way we'll wake up half way through the processing period to pull the
            //  next set of samples from the engine.
            //
            DWORD waitResult = WaitForSingleObject(_ShutdownEvent, _EngineLatencyInMS / 2);
            switch (waitResult)
            {
            case WAIT_OBJECT_0 + 0:     // _ShutdownEvent
                stillPlaying = false;       // We're done, exit the loop.
                break;
            case WAIT_TIMEOUT:          // Timeout
            {
                //
                //  We need to retrieve the next buffer of samples from the audio capturer.
                //
                BYTE *pData = NULL;
                UINT32 framesAvailable = 0;
                DWORD  flags = 0;

                //
                //  Find out how much capture data is available.  We need to make sure we don't run over the length
                //  of our capture buffer.  We'll discard any samples that don't fit in the buffer.
                //
                hr = _CaptureClient->GetBuffer(&pData, &framesAvailable, &flags, NULL, NULL);
                if (SUCCEEDED(hr))
                {
                    UINT32 framesToCopy = min(framesAvailable, static_cast<UINT32>((_CaptureBufferSize - _CurrentCaptureIndex) / _FrameSize));
                    if (framesToCopy != 0)
                    {
                        const UINT32 bytesToCopy = static_cast<UINT32>(framesToCopy * _FrameSize);

                        //
                        //  The flags on capture tell us information about the data.
                        //
                        //  We only really care about the silent flag since we want to put frames of silence into the buffer
                        //  when we receive silence.  We rely on the fact that a logical bit 0 is silence for both float and int formats.
                        //
                        if (flags & AUDCLNT_BUFFERFLAGS_SILENT)
                        {
                            //
                            //  Fill 0s from the capture buffer to the output buffer.
                            //
                            ZeroMemory(&_CaptureBuffer[_CurrentCaptureIndex], bytesToCopy);
                        }
                        else
                        {
                            //
                            //  Copy data from the audio engine buffer to the output buffer.
                            //
                            CopyMemory(&_CaptureBuffer[_CurrentCaptureIndex], pData, bytesToCopy);

                            //
                            //  Queue the data on the source voice.  XAudio2 does not copy the
                            //  audio data, so it must stay valid until playback finishes.
                            //  pData is handed back to WASAPI by ReleaseBuffer() below, so we
                            //  submit our own copy in _CaptureBuffer instead.
                            //  NOTE(review): assumes _CaptureBuffer stays allocated until this
                            //  thread has destroyed the source voice - confirm against caller.
                            //
                            XAUDIO2_BUFFER buffer = { 0 };
                            buffer.AudioBytes = bytesToCopy;                              // size of the data in bytes
                            buffer.pAudioData = &_CaptureBuffer[_CurrentCaptureIndex];    // start of the data
                            HRESULT submitResult = source_voice->SubmitSourceBuffer( &buffer );
                            if (FAILED(submitResult))
                            {
                                printf_s("Unable to submit buffer to XAudio2 source voice: %x\n", submitResult);
                            }
                        }
                        //
                        //  Bump the capture buffer pointer.
                        //
                        _CurrentCaptureIndex += bytesToCopy;
                    }
                    hr = _CaptureClient->ReleaseBuffer(framesAvailable);
                    if (FAILED(hr))
                    {
                        printf_s("Unable to release capture buffer: %x!\n", hr);
                    }
                }
                break;
            }
            default:
                //
                //  Any other result (e.g. WAIT_FAILED) returns immediately, so staying in
                //  the loop would spin forever.  Report it and stop.
                //
                printf_s("Unexpected wait result on capture thread: %lu\n", waitResult);
                stillPlaying = false;
                break;
            }
        }
    }
    catch( const char* e )
    {
        // e names the XAudio2 call that failed during initialization.
        std::cout << e << std::endl;
    }

    //
    //  Cleanup XAudio2.  Order matters: the source voice sends to the mastering
    //  voice, so it has to be stopped and destroyed first, and both voices must be
    //  gone before the engine is released.
    //
    if( source_voice != 0 ) {
        source_voice->Stop();
        source_voice->FlushSourceBuffers();
        source_voice->DestroyVoice();
        source_voice = 0;
    }

    if( mastering_voice != 0 ) {
        mastering_voice->DestroyVoice();
        mastering_voice = 0;
    }

    if( xaudio != 0 ) {
        xaudio->Release();
        xaudio = 0;
    }

    // Only revert MMCSS if the registration above succeeded.
    if (mmcssHandle != NULL)
    {
        AvRevertMmThreadCharacteristics(mmcssHandle);
    }

    CoUninitialize();
    return 0;
}