watson SpeechToText でエラーになる

前提・実現したいこと

参考にした動画
 ダウンロードしたGITHUB
python3でIBM watson Speech To Text を使ってマイクからの日本語音声をテキストを変換したいのですが，動画通りにgithubからダウンロードして訂正し実行するとエラーになります・・・・

実行結果

transcribe.py実行
1[Errno 11001] getaddrinfo failed

speech.cfg
1[auth]
2apikey = 自身のapikey
3# Modify region based on where you provisioned your stt instance
4region = jp-tok
5

transcribe.py
1#!/usr/bin/env python
2#
3# Copyright 2016 IBM
4#
5# Licensed under the Apache License, Version 2.0 (the "License"); you may
6# not use this file except in compliance with the License. You may obtain
7# a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14# License for the specific language governing permissions and limitations
15# under the License.
16
17import argparse
18import base64
19import configparser
20import json
21import threading
22import time
23
24import pyaudio
25import websocket
26from websocket._abnf import ABNF
27
28CHUNK = 1024
29FORMAT = pyaudio.paInt16
30# Even if your default input is multi channel (like a webcam mic),
31# it's really important to only record 1 channel, as the STT service
32# does not do anything useful with stereo. You get a lot of "hmmm"
33# back.
34CHANNELS = 1
35# Rate is important, nothing works without it. This is a pretty
36# standard default. If you have an audio device that requires
37# something different, change this.
38RATE = 44100
39RECORD_SECONDS = 5
40FINALS = []
41LAST = None
42
43REGION_MAP = {
44    'us-east': 'gateway-wdc.watsonplatform.net',
45    'us-south': 'stream.watsonplatform.net',
46    'eu-gb': 'stream.watsonplatform.net',
47    'eu-de': 'stream-fra.watsonplatform.net',
48    'au-syd': 'gateway-syd.watsonplatform.net',
49    'jp-tok': 'gateway-syd.watsonplatform.net',
50}
51
52
53def read_audio(ws, timeout):
54    """Read audio and sent it to the websocket port.
55
56    This uses pyaudio to read from a device in chunks and send these
57    over the websocket wire.
58
59    """
60    global RATE
61    p = pyaudio.PyAudio()
62    # NOTE(sdague): if you don't seem to be getting anything off of
63    # this you might need to specify:
64    #
65    #    input_device_index=N,
66    #
67    # Where N is an int. You'll need to do a dump of your input
68    # devices to figure out which one you want.
69    RATE = int(p.get_default_input_device_info()['defaultSampleRate'])
70    stream = p.open(format=FORMAT,
71                    channels=CHANNELS,
72                    rate=RATE,
73                    input=True,
74                    frames_per_buffer=CHUNK)
75
76    print("* recording")
77    rec = timeout or RECORD_SECONDS
78
79    for i in range(0, int(RATE / CHUNK * rec)):
80        data = stream.read(CHUNK)
81        # print("Sending packet... %d" % i)
82        # NOTE(sdague): we're sending raw binary in the stream, we
83        # need to indicate that otherwise the stream service
84        # interprets this as text control messages.
85        ws.send(data, ABNF.OPCODE_BINARY)
86
87    # Disconnect the audio stream
88    stream.stop_stream()
89    stream.close()
90    print("* done recording")
91
92    # In order to get a final response from STT we send a stop, this
93    # will force a final=True return message.
94    data = {"action": "stop"}
95    ws.send(json.dumps(data).encode('utf8'))
96    # ... which we need to wait for before we shutdown the websocket
97    time.sleep(1)
98    ws.close()
99
100    # ... and kill the audio device
101    p.terminate()
102
103
104def on_message(self, msg):
105    """Print whatever messages come in.
106
107    While we are processing any non trivial stream of speech Watson
108    will start chunking results into bits of transcripts that it
109    considers "final", and start on a new stretch. It's not always
110    clear why it does this. However, it means that as we are
111    processing text, any time we see a final chunk, we need to save it
112    off for later.
113    """
114    global LAST
115    data = json.loads(msg)
116    if "results" in data:
117        if data["results"][0]["final"]:
118            FINALS.append(data)
119            LAST = None
120        else:
121            LAST = data
122        # This prints out the current fragment that we are working on
123        print(data['results'][0]['alternatives'][0]['transcript'])
124
125
126def on_error(self, error):
127    """Print any errors."""
128    print(error)
129
130
131def on_close(ws):
132    """Upon close, print the complete and final transcript."""
133    global LAST
134    if LAST:
135        FINALS.append(LAST)
136    transcript = "".join([x['results'][0]['alternatives'][0]['transcript']
137                          for x in FINALS])
138    print(transcript)
139
140
141def on_open(ws):
142    """Triggered as soon a we have an active connection."""
143    args = ws.args
144    data = {
145        "action": "start",
146        # this means we get to send it straight raw sampling
147        "content-type": "audio/l16;rate=%d" % RATE,
148        "continuous": True,
149        "interim_results": True,
150        # "inactivity_timeout": 5, # in order to use this effectively
151        # you need other tests to handle what happens if the socket is
152        # closed by the server.
153        "word_confidence": True,
154        "timestamps": True,
155        "max_alternatives": 3
156    }
157
158    # Send the initial control message which sets expectations for the
159    # binary stream that follows:
160    ws.send(json.dumps(data).encode('utf8'))
161    # Spin off a dedicated thread where we are going to read and
162    # stream out audio.
163    threading.Thread(target=read_audio,
164                     args=(ws, args.timeout)).start()
165
166def get_url():
167    config = configparser.RawConfigParser()
168    config.read('speech.cfg')
169    # See
170    # https://console.bluemix.net/docs/services/speech-to-text/websockets.html#websockets
171    # for details on which endpoints are for each region.
172    region = config.get('auth', 'region')
173    host = REGION_MAP[region]
174    return ("wss://{}/speech-to-text/api/v1/recognize"
175           "?model=ja-JP_BroadbandModel").format(host)
176
177def get_auth():
178    config = configparser.RawConfigParser()
179    config.read('speech.cfg')
180    apikey = config.get('auth', 'apikey')
181    return ("apikey", apikey)
182
183
184def parse_args():
185    parser = argparse.ArgumentParser(
186        description='Transcribe Watson text in real time')
187    parser.add_argument('-t', '--timeout', type=int, default=5)
188    # parser.add_argument('-d', '--device')
189    # parser.add_argument('-v', '--verbose', action='store_true')
190    args = parser.parse_args()
191    return args
192
193
194def main():
195    # Connect to websocket interfaces
196    headers = {}
197    userpass = ":".join(get_auth())
198    headers["Authorization"] = "Basic " + base64.b64encode(
199        userpass.encode()).decode()
200    url = get_url()
201
202    # If you really want to see everything going across the wire,
203    # uncomment this. However realize the trace is going to also do
204    # things like dump the binary sound packets in text in the
205    # console.
206    #
207    # websocket.enableTrace(True)
208    ws = websocket.WebSocketApp(url,
209                                header=headers,
210                                on_message=on_message,
211                                on_error=on_error,
212                                on_close=on_close)
213    ws.on_open = on_open
214    ws.args = parse_args()
215    # This gives control over the WebSocketApp. This is a blocking
216    # call, so it won't return until the ws.close() gets called (after
217    # 6 seconds in the dedicated thread).
218    ws.run_forever()
219
220
221if __name__ == "__main__":
222    main()
223

行動規範の内容に同意します

回答2件

自己解決したので載せます

can110様本当にありがとうございました！！

transcribe.py
1#!/usr/bin/env python
2#
3# Copyright 2016 IBM
4#
5# Licensed under the Apache License, Version 2.0 (the "License"); you may
6# not use this file except in compliance with the License. You may obtain
7# a copy of the License at
8#
9#    http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14# License for the specific language governing permissions and limitations
15# under the License.
16
17import argparse
18import base64
19import configparser
20import json
21import threading
22import time
23
24import pyaudio
25import websocket
26from websocket._abnf import ABNF
27
28CHUNK = 1024
29FORMAT = pyaudio.paInt16
30# Even if your default input is multi channel (like a webcam mic),
31# it's really important to only record 1 channel, as the STT service
32# does not do anything useful with stereo. You get a lot of "hmmm"
33# back.
34CHANNELS = 1
35# Rate is important, nothing works without it. This is a pretty
36# standard default. If you have an audio device that requires
37# something different, change this.
38RATE = 44100
39RECORD_SECONDS = 5
40FINALS = []
41LAST = None
42
43REGION_MAP = {
44    'us-east': 'gateway-wdc.watsonplatform.net',
45    'us-south': 'stream.watsonplatform.net',
46    'eu-gb': 'stream.watsonplatform.net',
47    'eu-de': 'stream-fra.watsonplatform.net',
48    'au-syd': 'gateway-syd.watsonplatform.net',
49    'jp-tok': 'api.jp-tok.speech-to-text.watson.cloud.ibm.com/instances/＃ご自身のID情報入れてください',
50}
51
52
53def read_audio(ws, timeout):
54    """Read audio and sent it to the websocket port.
55
56    This uses pyaudio to read from a device in chunks and send these
57    over the websocket wire.
58
59    """
60    global RATE
61    p = pyaudio.PyAudio()
62    # NOTE(sdague): if you don't seem to be getting anything off of
63    # this you might need to specify:
64    #
65    #    input_device_index=N,
66    #
67    # Where N is an int. You'll need to do a dump of your input
68    # devices to figure out which one you want.
69    RATE = int(p.get_default_input_device_info()['defaultSampleRate'])
70    stream = p.open(format=FORMAT,
71                    channels=CHANNELS,
72                    rate=RATE,
73                    input=True,
74                    frames_per_buffer=CHUNK)
75
76    print("* recording")
77    rec = timeout or RECORD_SECONDS
78
79    for i in range(0, int(RATE / CHUNK * rec)):
80        data = stream.read(CHUNK)
81        # print("Sending packet... %d" % i)
82        # NOTE(sdague): we're sending raw binary in the stream, we
83        # need to indicate that otherwise the stream service
84        # interprets this as text control messages.
85        ws.send(data, ABNF.OPCODE_BINARY)
86
87    # Disconnect the audio stream
88    stream.stop_stream()
89    stream.close()
90    print("* done recording")
91
92    # In order to get a final response from STT we send a stop, this
93    # will force a final=True return message.
94    data = {"action": "stop"}
95    ws.send(json.dumps(data).encode('utf8'))
96    # ... which we need to wait for before we shutdown the websocket
97    time.sleep(1)
98    ws.close()
99
100    # ... and kill the audio device
101    p.terminate()
102
103
104def on_message(self, msg):
105    """Print whatever messages come in.
106
107    While we are processing any non trivial stream of speech Watson
108    will start chunking results into bits of transcripts that it
109    considers "final", and start on a new stretch. It's not always
110    clear why it does this. However, it means that as we are
111    processing text, any time we see a final chunk, we need to save it
112    off for later.
113    """
114    global LAST
115    data = json.loads(msg)
116    if "results" in data:
117        if data["results"][0]["final"]:
118            FINALS.append(data)
119            LAST = None
120        else:
121            LAST = data
122        # This prints out the current fragment that we are working on
123        print(data['results'][0]['alternatives'][0]['transcript'])
124
125
126def on_error(self, error):
127    """Print any errors."""
128    print(error)
129
130
131def on_close(ws):
132    """Upon close, print the complete and final transcript."""
133    global LAST
134    if LAST:
135        FINALS.append(LAST)
136    transcript = "".join([x['results'][0]['alternatives'][0]['transcript']
137                          for x in FINALS])
138    print(transcript)
139
140
141def on_open(ws):
142    """Triggered as soon a we have an active connection."""
143    args = ws.args
144    data = {
145        "action": "start",
146        # this means we get to send it straight raw sampling
147        "content-type": "audio/l16;rate=%d" % RATE,
148        "continuous": True,
149        "interim_results": True,
150        # "inactivity_timeout": 5, # in order to use this effectively
151        # you need other tests to handle what happens if the socket is
152        # closed by the server.
153        "word_confidence": True,
154        "timestamps": True,
155        "max_alternatives": 3
156    }
157
158    # Send the initial control message which sets expectations for the
159    # binary stream that follows:
160    ws.send(json.dumps(data).encode('utf8'))
161    # Spin off a dedicated thread where we are going to read and
162    # stream out audio.
163    threading.Thread(target=read_audio,
164                     args=(ws, args.timeout)).start()
165
166def get_url():
167    config = configparser.RawConfigParser()
168    config.read('speech.cfg')
169    # See
170    # https://console.bluemix.net/docs/services/speech-to-text/websockets.html#websockets
171    # for details on which endpoints are for each region.
172    region = config.get('auth', 'region')
173    host = REGION_MAP[region]
174    return ("wss://{}/v1/recognize?model=ja-JP_BroadbandModel").format(host)　＃ここも変えています。
175
176def get_auth():
177    config = configparser.RawConfigParser()
178    config.read('speech.cfg')
179    apikey = config.get('auth', 'apikey')
180    return ("apikey", apikey)
181
182
183def parse_args():
184    parser = argparse.ArgumentParser(
185        description='Transcribe Watson text in real time')
186    parser.add_argument('-t', '--timeout', type=int, default=5)
187    # parser.add_argument('-d', '--device')
188    # parser.add_argument('-v', '--verbose', action='store_true')
189    args = parser.parse_args()
190    return args
191
192
193def main():
194    # Connect to websocket interfaces
195    headers = {}
196    userpass = ":".join(get_auth())
197    headers["Authorization"] = "Basic " + base64.b64encode(
198        userpass.encode()).decode()
199    url = get_url()
200
201    # If you really want to see everything going across the wire,
202    # uncomment this. However realize the trace is going to also do
203    # things like dump the binary sound packets in text in the
204    # console.
205    #
206    # websocket.enableTrace(True)
207    ws = websocket.WebSocketApp(url,
208                                header=headers,
209                                on_message=on_message,
210                                on_error=on_error,
211                                on_close=on_close)
212    ws.on_open = on_open
213    ws.args = parse_args()
214    # This gives control over the WebSocketApp. This is a blocking
215    # call, so it won't return until the ws.close() gets called (after
216    # 6 seconds in the dedicated thread).
217    ws.run_forever()
218
219
220if __name__ == "__main__":
221    main()
222