Azure Speech Servicesを使って音声認識をやってみた

音声を認識して会話できるチャットプログラムを作りたいので、音声認識の方法をいろいろ調べていました。

音声認識のサービスとしてはAzure、Google、AWS、OpenAIのものなどがあります。

Unityでの開発方法としては、UnityWebRequestを使ってREST APIを呼び出す方法と、SDKをインストールしてその機能を使って開発する方法があります。

当初はいろんなサンプルを見ても違いがわからず混乱していましたが、どうやらマイクロソフトのSpeech SDKを使うとマイクからの音声を認識でき、より簡単に実装できそうなので、SDKで試してみることにしました。

試行錯誤の末、ようやく動かせたのでコードを載せておきます。Unityで下記のコードを実行すると、マイクからの入力を日本語テキストに変換してコンソールに出力します。

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using System.IO;
using System.Threading.Tasks;
using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;
using Cysharp.Threading.Tasks;

/// <summary>
/// Unity component that performs one-shot Japanese speech recognition with the
/// Azure Speech SDK. When the component starts it listens to the default
/// microphone, recognizes a single utterance, and writes the outcome to the
/// Unity console.
/// </summary>
public class Speech : MonoBehaviour
{
    // Azure Speech resource credentials. The key is a placeholder: replace it
    // locally and do not commit a real key to source control.
    private static readonly string speechKey = "サブスクリプションキー";
    private static readonly string speechRegion = "japaneast";

    /// <summary>
    /// Unity lifecycle callback, invoked before the first frame update.
    /// Runs one recognition pass against the default microphone and logs the result.
    /// </summary>
    async void Start()
    {
        Debug.Log("Start");

        // Start is async void, so no caller can await it or observe a failure.
        // Handle every exception here and report it to the console explicitly.
        try
        {
            // Create the SpeechConfig from the subscription key and region,
            // and request Japanese recognition.
            var speechConfig = SpeechConfig.FromSubscription(speechKey, speechRegion);
            speechConfig.SpeechRecognitionLanguage = "ja-JP";

            // Capture audio from the system's default microphone.
            using var audioConfig = AudioConfig.FromDefaultMicrophoneInput();

            // Build the recognizer from the speech and audio configuration.
            using var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);

            Debug.Log("Speak into your microphone.");

            // Transcribes a single utterance; recognition ends when silence is
            // detected or the maximum utterance length is reached.
            // NOTE(review): the original comment cites a 15-second limit - confirm
            // against the documentation of the SDK version in use.
            var speechRecognitionResult = await speechRecognizer.RecognizeOnceAsync();
            OutputSpeechRecognitionResult(speechRecognitionResult);
        }
        catch (Exception ex)
        {
            Debug.LogException(ex);
        }
    }

    /// <summary>
    /// Logs the outcome of a recognition attempt to the Unity console.
    /// </summary>
    /// <param name="speechRecognitionResult">Result returned by the recognizer.</param>
    static void OutputSpeechRecognitionResult(SpeechRecognitionResult speechRecognitionResult)
    {
        switch (speechRecognitionResult.Reason)
        {
            case ResultReason.RecognizedSpeech: // Speech was recognized successfully.
                Debug.Log($"RECOGNIZED: Text={speechRecognitionResult.Text}");
                break;

            case ResultReason.NoMatch: // Audio was received but nothing was recognized.
                Debug.Log($"NOMATCH: Speech could not be recognized.");
                break;

            case ResultReason.Canceled: // Recognition was canceled, possibly by an error.
                var cancellation = CancellationDetails.FromResult(speechRecognitionResult);
                Debug.LogWarning($"CANCELED: Reason={cancellation.Reason}");

                if (cancellation.Reason == CancellationReason.Error)
                {
                    // Use the error log level so failures show up under the console's error filter.
                    Debug.LogError($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                    Debug.LogError($"CANCELED: ErrorDetails={cancellation.ErrorDetails}");
                    Debug.LogError($"CANCELED: Did you set the speech resource key and region values?");
                }
                break;

            default: // Any other reason is unexpected for one-shot recognition; make it visible.
                Debug.LogWarning($"UNEXPECTED: Reason={speechRecognitionResult.Reason}");
                break;
        }
    }
}

実行結果