この記事とGitHubのソースを参考にGoogle Cloud Speech APIを利用できるところまではできたのですが、
cmd
1音声 2音声の 3音声のに 4音声の認識 5音声の認識で 6音声の認識です 7 テスト 8 テスト 9 テスト 10 テスト 11音声の認識 テスト
といった感じで不要な部分を少し削ることはできたのですが、認識中の途中経過の言葉も表示されてしまい、最終的に認識が確定した結果(「音声の認識 テスト」の部分)だけを取得することができず、困っています。
GitHubのソース
ほとんどこちらの方の引用ですが、該当ソースです。
リアルタイムに取得した音声の最終結果を拾う方法を教えて頂きたいです。
python:mic.py
"""Stream microphone audio to Google Cloud Speech (v1beta1 gRPC) and print
only the FINAL recognition result.

Interim (in-progress) hypotheses are rewritten in place on a single
terminal line; only the result the API marks ``is_final`` is kept as a
printed line. This fixes the original behavior where every interim
hypothesis was printed on its own line ("音声 音声の 音声のに ...").
"""
import argparse
import audioop
import math
import sys
import time

import pyaudio
from gcloud.credentials import get_credentials
from google.cloud.speech.v1beta1 import cloud_speech_pb2
from google.rpc import code_pb2
from grpc.beta import implementations


class stdout:
    # ANSI escape sequences for terminal formatting.
    BOLD = "\033[1m"
    END = "\033[0m"
    CLEAR = "\033[2K"


def bold(string):
    """Return *string* prefixed with the ANSI bold escape."""
    return stdout.BOLD + string


def printr(string):
    """Rewrite the current terminal line in place with *string* (no newline)."""
    sys.stdout.write("\r")
    sys.stdout.write(string)
    sys.stdout.flush()


# Shared state between the PyAudio callback thread, the gRPC request
# generator and the response listener.
frames = []                    # captured audio chunks awaiting upload
silent_frames = []             # pre-roll buffer kept while waiting for speech
is_recording = False
should_finish_stream = False


class Result:
    """Latest recognition hypothesis received from the API."""

    def __init__(self):
        self.transcription = ""
        # BUG FIX: confidence was initialized to "" (str); it is multiplied
        # later (confidence * 100), so it must be numeric.
        self.confidence = 0.0
        # BUG FIX: stability is assigned in listen_loop() but was never
        # declared here.
        self.stability = 0.0
        self.is_final = False


recognition_result = Result()


def make_channel(host, port):
    """Build an SSL gRPC channel to *host*:*port* with OAuth2 credentials."""
    ssl_channel = implementations.ssl_channel_credentials(None, None, None)
    creds = get_credentials().create_scoped(args.speech_scope)
    auth_header = ("authorization", "Bearer " + creds.get_access_token().access_token)
    auth_plugin = implementations.metadata_call_credentials(
        lambda _, func: func([auth_header], None), name="google_creds")
    composite_channel = implementations.composite_channel_credentials(ssl_channel, auth_plugin)
    return implementations.secure_channel(host, port, composite_channel)


def listen_loop(recognize_stream):
    """Consume streaming responses until a final result arrives.

    FIX for the question: interim hypotheses are only shown via printr()
    (carriage-return rewrite, no newline), and a newline-terminated line
    is printed exactly once — for the transcription whose ``is_final``
    flag is set. That printed line is the "final result only" output.
    """
    global should_finish_stream
    global recognition_result

    for resp in recognize_stream:
        if resp.error.code != code_pb2.OK:
            raise RuntimeError(resp.error.message)

        for result in resp.results:
            for alt in result.alternatives:
                recognition_result.transcription = alt.transcript
                recognition_result.confidence = alt.confidence
                recognition_result.stability = result.stability

            if result.is_final:
                recognition_result.is_final = True
                should_finish_stream = True
                # Clear the interim line, then print ONLY the final result.
                printr(stdout.CLEAR)
                print("\r" + recognition_result.transcription)
                return

        # Show the in-progress hypothesis on one rewritten line.
        # (The original also called print() here, which emitted a newline
        # per interim result and caused the stacked partial outputs.)
        printr(stdout.CLEAR + recognition_result.transcription)


def request_stream():
    """Yield the streaming config first, then audio chunks until finished."""
    recognition_config = cloud_speech_pb2.RecognitionConfig(
        encoding=args.audio_encoding,
        sample_rate=args.sampling_rate,
        language_code=args.lang_code,
        max_alternatives=1,
    )
    streaming_config = cloud_speech_pb2.StreamingRecognitionConfig(
        config=recognition_config,
        interim_results=True,      # keep interim updates for the live display
        single_utterance=True,     # server finalizes after one utterance
    )

    yield cloud_speech_pb2.StreamingRecognizeRequest(streaming_config=streaming_config)

    while True:
        time.sleep(args.frame_seconds / 4)

        if should_finish_stream:
            return

        if frames:
            yield cloud_speech_pb2.StreamingRecognizeRequest(audio_content=frames.pop(0))


def pyaudio_callback(in_data, frame_count, time_info, status):
    """PyAudio stream callback: buffer each captured audio frame."""
    assert isinstance(in_data, bytes)
    frames.append(in_data)
    return (None, pyaudio.paContinue)


def run_recognition_loop():
    """Wait until speech is detected, then stream one utterance to the API."""
    global frames
    global silent_frames
    global is_recording
    global should_finish_stream

    # Keep at most the last 4 silent frames as pre-roll context so the
    # start of the utterance is not clipped.
    if len(silent_frames) > 4:
        silent_frames = silent_frames[-4:]

    while not is_recording:
        # BUG FIX: was `args.frame_seconds // 4` — floor division of the
        # 0.1 s default yields 0.0, i.e. a busy-wait burning CPU.
        time.sleep(args.frame_seconds / 4)

        if len(frames) > 4:
            for frame_index in range(4):
                data = frames[frame_index]
                rms = audioop.rms(data, 2)  # 2-byte (16-bit) samples
                decibel = 20 * math.log10(rms) if rms > 0 else 0
                if decibel < args.silent_decibel:
                    # Still silent: move these frames to the pre-roll buffer
                    # and let the caller restart the wait loop.
                    silent_frames += frames[0:frame_index + 1]
                    del frames[0:frame_index + 1]
                    return

            # All 4 frames were above the threshold: speech has started.
            is_recording = True
            frames = silent_frames + frames
            silent_frames = []

    with cloud_speech_pb2.beta_create_Speech_stub(make_channel(args.host, args.ssl_port)) as service:
        try:
            listen_loop(service.StreamingRecognize(request_stream(), args.deadline_seconds))
        except Exception as e:
            # Best-effort: report the gRPC/stream error and return so the
            # outer loop can start a fresh stream.
            print(str(e))


def main():
    global is_recording
    global should_finish_stream

    pa = pyaudio.PyAudio()
    # (The original enumerated input devices into an unused list here;
    # removed. Use --device-index to select a device.)

    stream = pa.open(format=pa.get_format_from_width(2),
                     channels=1,
                     rate=args.sampling_rate,
                     input_device_index=args.device_index,
                     input=True,
                     output=False,
                     frames_per_buffer=int(args.sampling_rate * args.frame_seconds),
                     stream_callback=pyaudio_callback)

    stream.start_stream()

    try:
        # One recognition session per utterance, forever (Ctrl+C to quit).
        while True:
            is_recording = False
            should_finish_stream = False
            run_recognition_loop()
    finally:
        # NOTE: this cleanup was unreachable in the original (dead code
        # after `while True`); a finally block makes it actually run.
        stream.stop_stream()
        stream.close()
        pa.terminate()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--sampling-rate", "-rate", type=int, default=16000)
    parser.add_argument("--device-index", "-device", type=int, default=0)
    parser.add_argument("--lang-code", "-lang", type=str, default="ja-JP")
    parser.add_argument("--audio-encoding", "-encode", type=str, default="LINEAR16")
    parser.add_argument("--frame-seconds", "-fsec", type=float, default=0.1,
                        help="Seconds per frame (default: 100 ms)")
    parser.add_argument("--deadline-seconds", "-dsec", type=int, default=60 * 3 + 5)
    parser.add_argument("--silent-decibel", "-decibel", type=int, default=40)
    parser.add_argument("--speech-scope", "-scope", type=str,
                        default="https://www.googleapis.com/auth/cloud-platform")
    parser.add_argument("--ssl-port", "-port", type=int, default=443)
    parser.add_argument("--host", "-host", type=str, default="speech.googleapis.com")
    args = parser.parse_args()
    main()
環境
windows10
python3.7
あなたの回答
tips
プレビュー