Azure Speech To Textで言ってない言葉が返ってくる（誤変換、誤認識ではなく明らかに異なる）

前提・実現したいこと

AzureのSpeechToTextを利用して、入力音声のリアルタイムテキスト変換を行おうとしています。

発生している問題

実際に音声を自分で喋って入力しているんですが、「テストテスト」や「こんにちはよろしく」等の言葉を話しても、
テキスト変換されて返って来たものは、「もし」「もしもし」等になっています。以下実行ログです。

実行時のログ

node index_azure.js
Listening at Port 8080
New Connection Initiated
Continuous Reco Started
A new call has connected.
Starting Media Stream MZfbe576884cb91fd7b6bb2d40xxxxxxx
RECOGNIZING: Text=もし
RECOGNIZED: Text=もし。
RECOGNIZING: Text=もしもし
RECOGNIZED: Text=もしもし。
RECOGNIZING: Text=もし
RECOGNIZED: Text=もし。
RECOGNIZING: Text=もし
Call Has Ended
RECOGNIZED: Text=もし。
NOMATCH: Speech could not be recognized.

    Session stopped event.

実際に喋ったタイミングでテキストが返ってきて表示されるので、入力した音声自体は正しくAzure側に渡せていると思うのですが…

「もし」や「もしもし」等の言葉から察するに、過去にそう喋った内容がいつまでも出ている？のかなと
考えているのですが、nodeの再起動等も何回もしているので、ずっと残っているというのもよくわかりません…

何かテキストの取得方法が間違っている？と思いますが、エラーが出ているわけではないので
どのように対処すればいいのかわかりません。

何かわかりましたらお教え頂けると幸いです。
よろしくお願いいたします。

該当のソースコード

javascript
1const WebSocket = require("ws");
2const express = require("express");
3const app = express();
4const server = require("http").createServer(app);
5const wss = new WebSocket.Server({ server });
6const alawmulaw = require("alawmulaw");
7const base64 = require("js-base64");
8const path = require("path");
9
10const {
11  SpeechRecognizer,
12  SpeechConfig,
13  AudioInputStream,
14  AudioConfig,
15  AudioStreamFormat,
16  ResultReason,
17  CancellationReason,
18} = require("microsoft-cognitiveservices-speech-sdk");
19
20// Include Azure Speech to Text
21const speechConfig = SpeechConfig.fromSubscription("{Key}", "{Region}");
22speechConfig.speechRecognitionLanguage = "ja-JP";
23
24// Handle Web Socket Connection
25wss.on(
26  "connection",
27  (connection = (ws) => {
28    console.log("New Connection Initiated");
29
30    // Configure Transcription Request
31    const azurePusher = AudioInputStream.createPushStream(AudioStreamFormat.getWaveFormatPCM(8000, 16, 1));
32    const audioConfig = AudioConfig.fromStreamInput(azurePusher);
33    const recognizer = new SpeechRecognizer(speechConfig, audioConfig);
34
35    recognizer.recognizing = (s, e) => {
36      console.log(`RECOGNIZING: Text=${e.result.text}`);
37    };
38
39    recognizer.recognized = (s, e) => {
40      if (e.result.reason == ResultReason.RecognizedSpeech) {
41        console.log(`RECOGNIZED: Text=${e.result.text}`);
42
43        wss.clients.forEach((client) => {
44          if (client.readyState === WebSocket.OPEN) {
45            client.send(
46              JSON.stringify({
47                event: "interim-transcription",
48                text: e.result.text,
49              })
50            );
51          }
52        });
53      } else if (e.result.reason == ResultReason.NoMatch) {
54        console.log("NOMATCH: Speech could not be recognized.");
55      }
56    };
57
58    recognizer.canceled = (s, e) => {
59      console.log(`CANCELED: Reason=${e.reason}`);
60
61      if (e.reason == CancellationReason.Error) {
62        console.log(`"CANCELED: ErrorCode=${e.errorCode}`);
63        console.log(`"CANCELED: ErrorDetails=${e.errorDetails}`);
64        console.log("CANCELED: Did you update the key and location/region info?");
65      }
66
67      recognizer.stopContinuousRecognitionAsync();
68    };
69
70    recognizer.sessionStopped = (s, e) => {
71      console.log("\n    Session stopped event.");
72      recognizer.stopContinuousRecognitionAsync();
73    };
74
75    recognizer.startContinuousRecognitionAsync(
76      () => {
77        console.log("Continuous Reco Started");
78      },
79      (err) => {
80        console.trace("err - " + err);
81        recognizer.close();
82        recognizer = undefined;
83      }
84    );
85
86    ws.on(
87      "message",
88      (incoming = (message) => {
89        const msg = JSON.parse(message);
90        switch (msg.event) {
91          case "connected":
92            console.log(`A new call has connected.`);
93
94            break;
95
96          case "start":
97            console.log(`Starting Media Stream ${msg.streamSid}`);
98            break;
99
100          case "media":
101            // Write Media Packets to the recognize stream
102            let streampayload = base64.decode(msg.media.payload);
103            let mulawdata = Buffer.from(streampayload);
104
105            let pcmdata = Buffer.from(alawmulaw.mulaw.decode(mulawdata));
106            azurePusher.write(pcmdata.buffer);
107
108            // console.log("\n---------pcm " + pcmdata.length.toString() + "----------");
109
110            //2 let rawPcm = alawmulaw.mulaw.decode(msg.media.payload);
111            //2 let uintView = new Uint8Array(rawPcm.buffer);
112            //2 azurePusher.write(uintView);
113
114            break;
115
116          case "stop":
117            console.log(`Call Has Ended`);
118            azurePusher.close();
119            recognizer.stopContinuousRecognitionAsync();
120            recognizer.close();
121            break;
122        }
123      })
124    );
125  })
126);
127
128//Handle HTTP Request
129app.get("/", (req, res) => res.sendFile(path.join(__dirname, "/index.html")));
130
131app.post("/", (req, res) => {
132  res.set("Content-Type", "text/xml");
133
134  res.send(`
135      <Response>
136        <Start>
137          <Stream url="wss://${req.headers.host}/"/>
138        </Start>
139        <Say voice="alice" language="ja-JP">次の60秒間のオーディオをWebSocketからストリーミングします</Say>
140        <Pause length="60" />
141      </Response>
142    `);
143});
144
145console.log("Listening at Port 8080");
146server.listen(8080);
147

2021/10/20 追記

html
1<!DOCTYPE html>
2<html>
3
4<head>
5    <title>Twilio MediaStreams</title>
6</head>
7
8<body>
9    <h1>Twilio MediaStreams</h1>
10    <h3>
11        リアルタイムで話した内容がテキストに変換されます
12    </h3>
13    <p id="transcription-container"></p>
14    <script>
15        document.addEventListener("DOMContentLoaded", event => {
16            webSocket = new WebSocket("ws://localhost:8080");
17            webSocket.onmessage = function(msg) {
18                const data = JSON.parse(msg.data);
19                if (data.event === "interim-transcription") {
20                    document.getElementById("transcription-container").innerHTML =
21                        data.text;
22                }
23            };
24        });
25    </script>
26</body>
27
28</html>