diff options
author | Anuj Solanki <anuj603362@gmail.com> | 2024-10-01 00:32:40 +0530 |
---|---|---|
committer | Anuj Solanki <anuj603362@gmail.com> | 2024-10-06 01:16:32 +0530 |
commit | 5a8f670c3f772cfe0345ed53e5989a6dca08a905 (patch) | |
tree | 7d82533a0f5356547e31609c3db1d0101de83376 /agl_service_voiceagent | |
parent | 1144fcd343bc56f8c27ff73d3e76904010dbb832 (diff) |
- Removed OpenAI's Whisper AI from agl-service-voiceagent and switched to
whisper.cpp for speech-to-text.
- Fix audio_recorder.
- Update the gRPC proto definition to include the online-mode status in ServiceStatus
- Set online_mode flag default to 0
- Change wake word to "hey automotive"
Bug-AGL: SPEC-5200
Change-Id: I9f1629cdcaef43498bf4cb9fdd950291a415819d
Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
Diffstat (limited to 'agl_service_voiceagent')
-rw-r--r-- | agl_service_voiceagent/config.ini | 4 | ||||
-rw-r--r-- | agl_service_voiceagent/generated/voice_agent_pb2.py | 72 | ||||
-rw-r--r-- | agl_service_voiceagent/nlu/snips_interface.py | 3 | ||||
-rw-r--r-- | agl_service_voiceagent/protos/voice_agent.proto | 1 | ||||
-rw-r--r-- | agl_service_voiceagent/servicers/voice_agent_servicer.py | 31 | ||||
-rw-r--r-- | agl_service_voiceagent/utils/audio_recorder.py | 4 | ||||
-rw-r--r-- | agl_service_voiceagent/utils/stt_model.py | 44 |
7 files changed, 86 insertions, 73 deletions
diff --git a/agl_service_voiceagent/config.ini b/agl_service_voiceagent/config.ini index d6d695e..e4f6313 100644 --- a/agl_service_voiceagent/config.ini +++ b/agl_service_voiceagent/config.ini @@ -9,7 +9,7 @@ snips_model_path = /usr/share/nlu/snips/model/ channels = 1 sample_rate = 16000 bits_per_sample = 16 -wake_word = hello +wake_word = hey automotive server_port = 51053 server_address = 127.0.0.1 rasa_model_path = /usr/share/nlu/rasa/models/ @@ -17,7 +17,7 @@ rasa_server_port = 51054 rasa_detached_mode = 1 base_log_dir = /usr/share/nlu/logs/ store_voice_commands = 0 -online_mode = 1 +online_mode = 0 online_mode_address = 65.108.107.216 online_mode_port = 50051 online_mode_timeout = 15 diff --git a/agl_service_voiceagent/generated/voice_agent_pb2.py b/agl_service_voiceagent/generated/voice_agent_pb2.py index 4606f60..d978664 100644 --- a/agl_service_voiceagent/generated/voice_agent_pb2.py +++ b/agl_service_voiceagent/generated/voice_agent_pb2.py @@ -14,49 +14,49 @@ _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"C\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 
\x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 \x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWak
eWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"X\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\x12\x13\n\x0bonline_mode\x18\x04 \x01(\x08\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 
\x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 \x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'voice_agent_pb2', _globals) if not _descriptor._USE_C_DESCRIPTORS: 
DESCRIPTOR._loaded_options = None - _globals['_STTFRAMEWORK']._serialized_start=993 - _globals['_STTFRAMEWORK']._serialized_end=1030 - _globals['_ONLINEMODE']._serialized_start=1032 - _globals['_ONLINEMODE']._serialized_end=1069 - _globals['_RECORDACTION']._serialized_start=1071 - _globals['_RECORDACTION']._serialized_end=1106 - _globals['_NLUMODEL']._serialized_start=1108 - _globals['_NLUMODEL']._serialized_end=1139 - _globals['_RECORDMODE']._serialized_start=1141 - _globals['_RECORDMODE']._serialized_end=1175 - _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1178 - _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1358 - _globals['_EXECUTESTATUSTYPE']._serialized_start=1361 - _globals['_EXECUTESTATUSTYPE']._serialized_end=1491 + _globals['_STTFRAMEWORK']._serialized_start=1014 + _globals['_STTFRAMEWORK']._serialized_end=1051 + _globals['_ONLINEMODE']._serialized_start=1053 + _globals['_ONLINEMODE']._serialized_end=1090 + _globals['_RECORDACTION']._serialized_start=1092 + _globals['_RECORDACTION']._serialized_end=1127 + _globals['_NLUMODEL']._serialized_start=1129 + _globals['_NLUMODEL']._serialized_end=1160 + _globals['_RECORDMODE']._serialized_start=1162 + _globals['_RECORDMODE']._serialized_end=1196 + _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1199 + _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1379 + _globals['_EXECUTESTATUSTYPE']._serialized_start=1382 + _globals['_EXECUTESTATUSTYPE']._serialized_end=1512 _globals['_EMPTY']._serialized_start=21 _globals['_EMPTY']._serialized_end=28 _globals['_SERVICESTATUS']._serialized_start=30 - _globals['_SERVICESTATUS']._serialized_end=97 - _globals['_VOICEAUDIO']._serialized_start=99 - _globals['_VOICEAUDIO']._serialized_end=193 - _globals['_WAKEWORDSTATUS']._serialized_start=195 - _globals['_WAKEWORDSTATUS']._serialized_end=227 - _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=230 - _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=377 - _globals['_RECOGNIZEVOICECONTROL']._serialized_start=380 - 
_globals['_RECOGNIZEVOICECONTROL']._serialized_end=589 - _globals['_RECOGNIZETEXTCONTROL']._serialized_start=591 - _globals['_RECOGNIZETEXTCONTROL']._serialized_end=665 - _globals['_INTENTSLOT']._serialized_start=667 - _globals['_INTENTSLOT']._serialized_end=708 - _globals['_RECOGNIZERESULT']._serialized_start=711 - _globals['_RECOGNIZERESULT']._serialized_end=853 - _globals['_EXECUTEINPUT']._serialized_start=855 - _globals['_EXECUTEINPUT']._serialized_end=920 - _globals['_EXECUTERESULT']._serialized_start=922 - _globals['_EXECUTERESULT']._serialized_end=991 - _globals['_VOICEAGENTSERVICE']._serialized_start=1494 - _globals['_VOICEAGENTSERVICE']._serialized_end=1914 + _globals['_SERVICESTATUS']._serialized_end=118 + _globals['_VOICEAUDIO']._serialized_start=120 + _globals['_VOICEAUDIO']._serialized_end=214 + _globals['_WAKEWORDSTATUS']._serialized_start=216 + _globals['_WAKEWORDSTATUS']._serialized_end=248 + _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=251 + _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=398 + _globals['_RECOGNIZEVOICECONTROL']._serialized_start=401 + _globals['_RECOGNIZEVOICECONTROL']._serialized_end=610 + _globals['_RECOGNIZETEXTCONTROL']._serialized_start=612 + _globals['_RECOGNIZETEXTCONTROL']._serialized_end=686 + _globals['_INTENTSLOT']._serialized_start=688 + _globals['_INTENTSLOT']._serialized_end=729 + _globals['_RECOGNIZERESULT']._serialized_start=732 + _globals['_RECOGNIZERESULT']._serialized_end=874 + _globals['_EXECUTEINPUT']._serialized_start=876 + _globals['_EXECUTEINPUT']._serialized_end=941 + _globals['_EXECUTERESULT']._serialized_start=943 + _globals['_EXECUTERESULT']._serialized_end=1012 + _globals['_VOICEAGENTSERVICE']._serialized_start=1515 + _globals['_VOICEAGENTSERVICE']._serialized_end=1935 # @@protoc_insertion_point(module_scope) diff --git a/agl_service_voiceagent/nlu/snips_interface.py b/agl_service_voiceagent/nlu/snips_interface.py index a32f574..25ad05b 100644 --- 
a/agl_service_voiceagent/nlu/snips_interface.py +++ b/agl_service_voiceagent/nlu/snips_interface.py @@ -46,8 +46,7 @@ class SnipsInterface: preprocessed_text = text.lower().strip() # remove special characters, punctuation, and extra whitespaces preprocessed_text = re.sub(r'[^\w\s]', '', preprocessed_text).strip() - # replace % with " precent" - preprocessed_text = re.sub(r'%', ' percent', preprocessed_text) + preprocessed_text = re.sub(r'percent', '', preprocessed_text) # replace ° with " degrees" preprocessed_text = re.sub(r'°', ' degrees ', preprocessed_text) return preprocessed_text diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto index bd2daa2..72d48c6 100644 --- a/agl_service_voiceagent/protos/voice_agent.proto +++ b/agl_service_voiceagent/protos/voice_agent.proto @@ -61,6 +61,7 @@ message ServiceStatus { string version = 1; bool status = 2; string wake_word = 3; + bool online_mode = 4; } message VoiceAudio { diff --git a/agl_service_voiceagent/servicers/voice_agent_servicer.py b/agl_service_voiceagent/servicers/voice_agent_servicer.py index 2a4de33..c149b6d 100644 --- a/agl_service_voiceagent/servicers/voice_agent_servicer.py +++ b/agl_service_voiceagent/servicers/voice_agent_servicer.py @@ -199,6 +199,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): version=self.service_version, status=True, wake_word=self.wake_word, + online_mode = self.online_mode ) # Convert the response object to a JSON string and log it @@ -280,8 +281,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): "recorder": recorder, "audio_file": audio_file } - - recorder.start_recording() + + def record(): + recorder.start_recording() + + record_thread = threading.Thread(target=record) + record_thread.start() elif request.action == voice_agent_pb2.STOP: stream_uuid = request.stream_id @@ -294,6 +299,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): 
recorder = self.rvc_stream_uuids[stream_uuid]["recorder"] audio_file = self.rvc_stream_uuids[stream_uuid]["audio_file"] del self.rvc_stream_uuids[stream_uuid] + print(use_online_mode) recorder.stop_recording() @@ -316,12 +322,19 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): recognizer_uuid = self.stt_model.setup_vosk_recognizer() stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework) used_kaldi = True - print(stt) if stt not in ["FILE_NOT_FOUND", "FILE_FORMAT_INVALID", "VOICE_NOT_RECOGNIZED", ""]: if request.nlu_model == voice_agent_pb2.SNIPS: - extracted_intent = self.snips_interface.extract_intent(stt) - intent, intent_actions = self.snips_interface.process_intent(extracted_intent) + try: + extracted_intent = self.snips_interface.extract_intent(stt) + except Exception as e: + print(e) + extracted_intent = "" + if extracted_intent != "": + intent, intent_actions = self.snips_interface.process_intent(extracted_intent) + else: + intent = "" + intent_actions = {} if not intent or intent == "": status = voice_agent_pb2.INTENT_NOT_RECOGNIZED @@ -346,14 +359,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): status = voice_agent_pb2.NLU_MODEL_NOT_SUPPORTED else: - stt = "" status = voice_agent_pb2.VOICE_NOT_RECOGNIZED # cleanup the kaldi recognizer if used_kaldi: self.stt_model.cleanup_recognizer(recognizer_uuid) used_kaldi = False - # delete the audio file if not self.store_voice_command: delete_file(audio_file) @@ -516,7 +527,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): exec_response = "Uh oh, I failed to stop the media." exec_status = voice_agent_pb2.EXEC_ERROR else: - exec_response = "Sorry, I failed to execute command against intent 'MediaControl'. Maybe try again with more specific instructions." + exec_response = "Sorry, I failed to execute command." 
exec_status = voice_agent_pb2.EXEC_ERROR @@ -572,7 +583,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): if "value" in execution_item: value = execution_item["value"] if self.set_current_values(signal, value): - exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'." + exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'." exec_status = voice_agent_pb2.EXEC_SUCCESS elif "factor" in execution_item: @@ -593,7 +604,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): value = current_value - factor value = str(value) if self.set_current_values(signal, value): - exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'." + exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'." exec_status = voice_agent_pb2.EXEC_SUCCESS else: diff --git a/agl_service_voiceagent/utils/audio_recorder.py b/agl_service_voiceagent/utils/audio_recorder.py index 49716c9..e362480 100644 --- a/agl_service_voiceagent/utils/audio_recorder.py +++ b/agl_service_voiceagent/utils/audio_recorder.py @@ -64,7 +64,7 @@ class AudioRecorder: """ print("Creating pipeline for audio recording in {} mode...".format(self.mode)) self.pipeline = Gst.Pipeline() - autoaudiosrc = Gst.ElementFactory.make("autoaudiosrc", None) + autoaudiosrc = Gst.ElementFactory.make("alsasrc", None) queue = Gst.ElementFactory.make("queue", None) queue.set_property("max-size-buffers", 0) queue.set_property("max-size-bytes", 0) @@ -109,6 +109,7 @@ class AudioRecorder: Start recording audio using the GStreamer pipeline. """ self.pipeline.set_state(Gst.State.PLAYING) + self.loop.run() print("Recording Voice Input...") @@ -186,3 +187,4 @@ class AudioRecorder: print("Pipeline cleanup complete!") self.bus = None self.pipeline = None + self.loop.quit()
\ No newline at end of file diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py index 7e8ad8b..0a092ea 100644 --- a/agl_service_voiceagent/utils/stt_model.py +++ b/agl_service_voiceagent/utils/stt_model.py @@ -21,7 +21,7 @@ import wave from agl_service_voiceagent.utils.common import generate_unique_uuid # import the whisper model -import whisper +# import whisper # for whisper timeout feature from concurrent.futures import ThreadPoolExecutor import subprocess @@ -93,28 +93,28 @@ class STTModel: return result # Recognize speech using the whisper model - def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False): - """ - Recognize speech and return the result as a JSON object. - - Args: - filename (str): The path to the audio file. - timeout (int, optional): The timeout for recognition (default is 5 seconds). - fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported. - language (str, optional): The language code for recognition (default is None). - - Returns: - dict: A JSON object containing recognition results. - """ - def transcribe_with_whisper(): - return self.whisper_model.transcribe(filename, language = language,fp16=fp16) + # def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False): + # """ + # Recognize speech and return the result as a JSON object. + + # Args: + # filename (str): The path to the audio file. + # timeout (int, optional): The timeout for recognition (default is 5 seconds). + # fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported. + # language (str, optional): The language code for recognition (default is None). + + # Returns: + # dict: A JSON object containing recognition results. 
+ # """ + # def transcribe_with_whisper(): + # return self.whisper_model.transcribe(filename, language = language,fp16=fp16) - with ThreadPoolExecutor() as executor: - future = executor.submit(transcribe_with_whisper) - try: - return future.result(timeout=timeout) - except TimeoutError: - return {"error": "Transcription with Whisper exceeded the timeout."} + # with ThreadPoolExecutor() as executor: + # future = executor.submit(transcribe_with_whisper) + # try: + # return future.result(timeout=timeout) + # except TimeoutError: + # return {"error": "Transcription with Whisper exceeded the timeout."} def recognize_using_whisper_cpp(self,filename): command = self.whisper_cpp_path |