前提・実現したいこと
[Python3] Google Cloud Speech gRPC APIでストリーミング音声認識の実行
こちらではPyAudioを利用してマイク入力を利用されていますが、これをブラウザからのマイク入力に置き換えるにはどうすべきか、ということです。
onaudioprocess で渡される音声バイト列と、PyAudio の Stream で得られるバイト列の扱いがどう異なるのかを知りたいです。
該当のソースコード
python
"""Tornado WebSocket server that streams browser microphone audio to the
Google Cloud Speech gRPC API for streaming recognition.

Audio arrives as binary WebSocket messages and must be 16-bit
little-endian PCM (LINEAR16) at SAMPLE_RATE Hz — the browser side has
to convert the Float32 samples from onaudioprocess before sending.
"""
import audioop
import math
import os
import sys
import threading
import time

import tornado.websocket

from google.cloud import speech
from google.cloud.speech import enums, types
from google.rpc import code_pb2

# BCP-47 language tag. Google Speech rejects underscore forms such as
# 'ja_JP'; the hyphenated tag is required.
LANGUAGE_CODE = 'ja-JP'
FRAME_SECONDS = 0.1    # nominal duration (s) of one buffered audio frame
SILENT_DECIBEL = 40    # frames quieter than this are treated as silence
SAMPLE_SIZE = 2        # bytes per sample (16-bit PCM)
SAMPLE_RATE = 44100    # Hz; must match the browser AudioContext rate

# Shared state between the WebSocket handler thread and the recognition
# thread. NOTE(review): these are mutated from two threads without a
# lock; CPython's GIL keeps append/pop atomic, but a queue.Queue would
# be safer.
frames = []            # raw PCM chunks received from the browser
silent_frames = []     # recent below-threshold frames kept as lead-in
should_finish_stream = False
is_recording = False


class stdout:
    """ANSI escape sequences for terminal output."""
    BOLD = "\033[1m"
    END = "\033[0m"
    CLEAR = "\033[2K"


def bold(string):
    """Wrap *string* in ANSI bold escapes."""
    return stdout.BOLD + string + stdout.END


def printr(string):
    """Overwrite the current terminal line with *string* (no newline)."""
    sys.stdout.write("\r" + stdout.CLEAR)
    sys.stdout.write(string)
    sys.stdout.flush()


class Result:
    """Mutable holder for the latest recognition result."""

    def __init__(self):
        self.transcription = ""
        self.confidence = ""
        self.is_final = False


recognition_result = Result()


class StreamVoiceSocketHandler(tornado.websocket.WebSocketHandler):
    """WebSocket handler that receives audio chunks from the browser."""

    def open(self):
        print("connected")
        # One recognition worker per connection.
        self.thread = TestThread()
        self.thread.start()

    def on_message(self, message):
        # *message* is a binary frame of 16-bit PCM audio.
        frames.append(message)

    def on_close(self):
        print("disconnected")


class TestThread(threading.Thread):
    """Worker thread driving repeated streaming-recognition sessions."""

    def run(self):
        global is_recording
        global should_finish_stream

        self.client = speech.SpeechClient()
        self.config = types.RecognitionConfig(
            encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=SAMPLE_RATE,
            language_code=LANGUAGE_CODE)
        # interim_results=True is required because listen_loop consumes
        # non-final results (it reads result.stability and prints
        # interim transcripts).
        self.streaming_config = types.StreamingRecognitionConfig(
            config=self.config,
            interim_results=True)

        while True:
            should_finish_stream = False
            is_recording = False
            self.run_recognition_loop()

    def run_recognition_loop(self):
        """Wait for voiced audio, then run one recognition session."""
        global frames
        global silent_frames
        global is_recording
        global should_finish_stream

        # Keep at most the last 4 silent frames as lead-in context.
        if len(silent_frames) > 4:
            silent_frames = silent_frames[-4:]

        while not is_recording:
            # BUG FIX: the original used FRAME_SECONDS // 4, which
            # floor-divides 0.1 to 0.0 and busy-waits at full CPU.
            time.sleep(FRAME_SECONDS / 4)

            if len(frames) > 4:
                for frame_index in range(4):
                    data = frames[frame_index]
                    # RMS of 16-bit samples; assumes LINEAR16 input.
                    rms = audioop.rms(data, 2)
                    decibel = 20 * math.log10(rms) if rms > 0 else 0
                    if decibel < SILENT_DECIBEL:
                        # Still silence: stash and retry from the top.
                        silent_frames += frames[0:frame_index + 1]
                        del frames[0:frame_index + 1]
                        return

                # All 4 leading frames are voiced: start recognizing,
                # prepending the buffered silence as lead-in audio.
                is_recording = True
                frames = silent_frames + frames
                silent_frames = []

        try:
            self.listen_loop(
                self.client.streaming_recognize(
                    self.streaming_config, self.request_stream()))
            printr(" ".join((bold(recognition_result.transcription), " ",
                             "confidence: ",
                             str(int(recognition_result.confidence * 100)),
                             "%")))
            print()
        except Exception as e:
            # Best-effort: report and let the outer loop restart.
            print(str(e))

    def listen_loop(self, recognize_stream):
        """Consume streaming responses until a final result arrives."""
        global should_finish_stream
        global recognition_result

        for resp in recognize_stream:
            if resp.error.code != code_pb2.OK:
                raise RuntimeError(resp.error.message)

            for result in resp.results:
                for alt in result.alternatives:
                    recognition_result.transcription = alt.transcript
                    recognition_result.confidence = alt.confidence
                    recognition_result.stability = result.stability
                    printr(" ".join(
                        (alt.transcript, " ", "stability: ",
                         str(int(result.stability * 100)), "%")))

                if result.is_final:
                    recognition_result.is_final = True
                    should_finish_stream = True
                    return

    def request_stream(self):
        """Generator yielding buffered audio as streaming requests."""
        while True:
            time.sleep(FRAME_SECONDS / 4)

            if should_finish_stream:
                return

            if len(frames) > 0:
                yield types.StreamingRecognizeRequest(
                    audio_content=frames.pop(0))
javascript
1function start_recording() { 2 let server = null; 3 let AudioContext = null; 4 5 function connect() { 6 return new Promise(function (resolve, reject) { 7 server = new WebSocket("wss://" + location.host + "/ws/stream_voice"); 8 server.onopen = function () { 9 resolve(server); 10 console.log('connected'); 11 }; 12 server.onerror = function (err) { 13 reject(err); 14 }; 15 }); 16 } 17 18 connect().then(function (server) { 19 const handleSuccess = function (stream) { 20 AudioContext = window.AudioContext || window.webkitAudioContext; 21 const context = new AudioContext(); 22 const input = context.createMediaStreamSource(stream); 23 const processor = context.createScriptProcessor(1024, 1, 1); 24 input.connect(processor); 25 processor.connect(context.destination); 26 27 processor.onaudioprocess = function (e) { 28 const voice = e.inputBuffer.getChannelData(0); 29 server.send(voice.buffer); 30 }; 31 }; 32 33 navigator.mediaDevices.getUserMedia({audio: true, video: false}).then(handleSuccess); 34 }).catch(function (err) { 35 }); 36}
補足情報(FW/ツールのバージョンなど)
python 3.6.4
tornado 4.5.3
google-cloud-speech 0.32.0
追記
デシベルの判定の部分ですでにおかしいですね。
やはり onaudioprocess で渡されるバッファをそのまま送信するのではダメなようです。
あなたの回答
tips
プレビュー