前提・実現したいこと
[Python3] Google Cloud Speech gRPC APIでストリーミング音声認識の実行
こちらではPyAudioを利用してマイク入力を利用されていますが、これをブラウザからのマイク入力に置き換えるにはどうすべきか、ということです。
onaudioprocess で渡される音声バイト列と、PyAudio の Stream で得られるバイト列の扱いがどう異なるのかを知りたいです。
該当のソースコード
python
"""Tornado WebSocket server that streams browser microphone audio to the
Google Cloud Speech gRPC API for streaming recognition.

Audio arrives as binary WebSocket messages and must be 16-bit
little-endian PCM (LINEAR16) at SAMPLE_RATE Hz — the browser side has
to convert the Float32 samples from onaudioprocess before sending.
"""
import audioop
import math
import os
import sys
import threading
import time

import tornado.websocket

from google.cloud import speech
from google.cloud.speech import enums, types
from google.rpc import code_pb2

# BCP-47 language tag. Google Speech rejects underscore forms such as
# 'ja_JP'; the hyphenated tag is required.
LANGUAGE_CODE = 'ja-JP'
FRAME_SECONDS = 0.1    # nominal duration (s) of one buffered audio frame
SILENT_DECIBEL = 40    # frames quieter than this are treated as silence
SAMPLE_SIZE = 2        # bytes per sample (16-bit PCM)
SAMPLE_RATE = 44100    # Hz; must match the browser AudioContext rate

# Shared state between the WebSocket handler thread and the recognition
# thread. NOTE(review): these are mutated from two threads without a
# lock; CPython's GIL keeps append/pop atomic, but a queue.Queue would
# be safer.
frames = []            # raw PCM chunks received from the browser
silent_frames = []     # recent below-threshold frames kept as lead-in
should_finish_stream = False
is_recording = False


class stdout:
    """ANSI escape sequences for terminal output."""
    BOLD = "\033[1m"
    END = "\033[0m"
    CLEAR = "\033[2K"


def bold(string):
    """Wrap *string* in ANSI bold escapes."""
    return stdout.BOLD + string + stdout.END


def printr(string):
    """Overwrite the current terminal line with *string* (no newline)."""
    sys.stdout.write("\r" + stdout.CLEAR)
    sys.stdout.write(string)
    sys.stdout.flush()


class Result:
    """Mutable holder for the latest recognition result."""

    def __init__(self):
        self.transcription = ""
        self.confidence = ""
        self.is_final = False


recognition_result = Result()


class StreamVoiceSocketHandler(tornado.websocket.WebSocketHandler):
    """WebSocket handler that receives audio chunks from the browser."""

    def open(self):
        print("connected")
        # One recognition worker per connection.
        self.thread = TestThread()
        self.thread.start()

    def on_message(self, message):
        # *message* is a binary frame of 16-bit PCM audio.
        frames.append(message)

    def on_close(self):
        print("disconnected")


class TestThread(threading.Thread):
    """Worker thread driving repeated streaming-recognition sessions."""

    def run(self):
        global is_recording
        global should_finish_stream

        self.client = speech.SpeechClient()
        self.config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=SAMPLE_RATE,
            language_code=LANGUAGE_CODE)
        # interim_results=True is required because listen_loop consumes
        # non-final results (it reads result.stability and prints
        # interim transcripts).
        self.streaming_config = types.StreamingRecognitionConfig(
            config=self.config,
            interim_results=True)

        while True:
            should_finish_stream = False
            is_recording = False
            self.run_recognition_loop()

    def run_recognition_loop(self):
        """Wait for voiced audio, then run one recognition session."""
        global frames
        global silent_frames
        global is_recording
        global should_finish_stream

        # Keep at most the last 4 silent frames as lead-in context.
        if len(silent_frames) > 4:
            silent_frames = silent_frames[-4:]

        while not is_recording:
            # BUG FIX: the original used FRAME_SECONDS // 4, which
            # floor-divides 0.1 to 0.0 and busy-waits at full CPU.
            time.sleep(FRAME_SECONDS / 4)

            if len(frames) > 4:
                for frame_index in range(4):
                    data = frames[frame_index]
                    # RMS of 16-bit samples; assumes LINEAR16 input.
                    rms = audioop.rms(data, 2)
                    decibel = 20 * math.log10(rms) if rms > 0 else 0
                    if decibel < SILENT_DECIBEL:
                        # Still silence: stash and retry from the top.
                        silent_frames += frames[0:frame_index + 1]
                        del frames[0:frame_index + 1]
                        return

                # All 4 leading frames are voiced: start recognizing,
                # prepending the buffered silence as lead-in audio.
                is_recording = True
                frames = silent_frames + frames
                silent_frames = []

        try:
            self.listen_loop(
                self.client.streaming_recognize(
                    self.streaming_config, self.request_stream()))
            printr(" ".join((bold(recognition_result.transcription), " ",
                             "confidence: ",
                             str(int(recognition_result.confidence * 100)),
                             "%")))
            print()
        except Exception as e:
            # Best-effort: report and let the outer loop restart.
            print(str(e))

    def listen_loop(self, recognize_stream):
        """Consume streaming responses until a final result arrives."""
        global should_finish_stream
        global recognition_result

        for resp in recognize_stream:
            if resp.error.code != code_pb2.OK:
                raise RuntimeError(resp.error.message)

            for result in resp.results:
                for alt in result.alternatives:
                    recognition_result.transcription = alt.transcript
                    recognition_result.confidence = alt.confidence
                    recognition_result.stability = result.stability
                    printr(" ".join(
                        (alt.transcript, " ", "stability: ",
                         str(int(result.stability * 100)), "%")))

                if result.is_final:
                    recognition_result.is_final = True
                    should_finish_stream = True
                    return

    def request_stream(self):
        """Generator yielding buffered audio as streaming requests."""
        while True:
            time.sleep(FRAME_SECONDS / 4)

            if should_finish_stream:
                return

            if len(frames) > 0:
                yield types.StreamingRecognizeRequest(
                    audio_content=frames.pop(0))
javascript
1function start_recording() { 2 let server = null; 3 let AudioContext = null; 4 5 function connect() { 6 return new Promise(function (resolve, reject) { 7 server = new WebSocket("wss://" + location.host + "/ws/stream_voice"); 8 server.onopen = function () { 9 resolve(server); 10 console.log('connected'); 11 }; 12 server.onerror = function (err) { 13 reject(err); 14 }; 15 }); 16 } 17 18 connect().then(function (server) { 19 const handleSuccess = function (stream) { 20 AudioContext = window.AudioContext || window.webkitAudioContext; 21 const context = new AudioContext(); 22 const input = context.createMediaStreamSource(stream); 23 const processor = context.createScriptProcessor(1024, 1, 1); 24 input.connect(processor); 25 processor.connect(context.destination); 26 27 processor.onaudioprocess = function (e) { 28 const voice = e.inputBuffer.getChannelData(0); 29 server.send(voice.buffer); 30 }; 31 }; 32 33 navigator.mediaDevices.getUserMedia({audio: true, video: false}).then(handleSuccess); 34 }).catch(function (err) { 35 }); 36}
補足情報(FW/ツールのバージョンなど)
python 3.6.4
tornado 4.5.3
google-cloud-speech 0.32.0
追記
デシベルの判定の部分ですでにおかしいですね。
やはり onaudioprocess で渡されるバッファをそのまま送信するのではダメなようです。
あなたの回答
tips
プレビュー