From 5a8f670c3f772cfe0345ed53e5989a6dca08a905 Mon Sep 17 00:00:00 2001 From: Anuj Solanki Date: Tue, 1 Oct 2024 00:32:40 +0530 Subject: Remove OpenAI's Whisper AI and Bug Fixing - Removed OpenAI's Whisper AI from agl-service-voiceagent and using whisper.cpp for speech-to-text. - Fix audio_recorder. - Update grpc protoc to include the online-mode status in ServiceStatus - Set online_mode flag default to 0 - Change wake word to "hey automotive" Bug-AGL: SPEC-5200 Change-Id: I9f1629cdcaef43498bf4cb9fdd950291a415819d Signed-off-by: Anuj Solanki --- README.md | 7 ++- agl_service_voiceagent/config.ini | 4 +- .../generated/voice_agent_pb2.py | 72 +++++++++++----------- agl_service_voiceagent/nlu/snips_interface.py | 3 +- agl_service_voiceagent/protos/voice_agent.proto | 1 + .../servicers/voice_agent_servicer.py | 31 +++++++--- agl_service_voiceagent/utils/audio_recorder.py | 4 +- agl_service_voiceagent/utils/stt_model.py | 44 ++++++------- 8 files changed, 90 insertions(+), 76 deletions(-) diff --git a/README.md b/README.md index d418994..7f037f7 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Automotive Grade Linux (AGL) Voice Agent Service -A gRPC-based voice agent service designed for Automotive Grade Linux (AGL). This service leverages GStreamer, Vosk, Snips, and RASA to seamlessly process user voice commands. It converts spoken words into text, extracts intents from these commands, and performs actions through the Kuksa interface. +A gRPC-based voice agent service designed for Automotive Grade Linux (AGL). This service leverages GStreamer, Vosk, Whisper, Snips, and RASA to seamlessly process user voice commands. It converts spoken words into text, extracts intents from these commands, and performs actions through the Kuksa interface. 
## Table of Contents - [Features](#features) @@ -62,14 +62,15 @@ Replace `SERVER_IP` with IP address of the running Voice Agent server, and `SERV To issue a voice command, use the following command: ```bash -voiceagent-service run-client --server_address SERVER_IP --server_port SERVER_PORT --action ExecuteVoiceCommand --mode manual --nlu NLU_ENGINE +voiceagent-service run-client --server_address SERVER_IP --server_port SERVER_PORT --action ExecuteVoiceCommand --mode manual --nlu NLU_ENGINE --stt-framework STT_FRAMEWORK ``` -Replace `NLU_ENGINE` with the preferred NLU engine ("snips" or "rasa"), `SERVER_IP` with IP address of the running Voice Agent server, and `SERVER_PORT` with the port of the running Voice Agent server. You can also pass a custom value to flag `--recording-time` if you want to change the default recording time from 5 seconds to any other value. +Replace `NLU_ENGINE` with the preferred NLU engine ("snips" or "rasa"), `SERVER_IP` with IP address of the running Voice Agent server, and `SERVER_PORT` with the port of the running Voice Agent server. You can also pass a custom value to flag `--recording-time` if you want to change the default recording time from 5 seconds to any other value. You can also pass `--stt-framework` to specify the STT framework to be used; supported frameworks are "vosk" and "whisper", and the default is "vosk". ## Configuration Configuration options for the AGL Voice Agent Service can be found in the default `config.ini` file. You can customize various settings, including the AI models, audio directories, and Kuksa integration. **Important:** while manually making changes to the config file make sure you add trailing slash to all the directory paths, ie. the paths to directories should always end with a `/`. 
## Maintainers +- **Anuj Solanki** - **Malik Talha** ## License diff --git a/agl_service_voiceagent/config.ini b/agl_service_voiceagent/config.ini index d6d695e..e4f6313 100644 --- a/agl_service_voiceagent/config.ini +++ b/agl_service_voiceagent/config.ini @@ -9,7 +9,7 @@ snips_model_path = /usr/share/nlu/snips/model/ channels = 1 sample_rate = 16000 bits_per_sample = 16 -wake_word = hello +wake_word = hey automotive server_port = 51053 server_address = 127.0.0.1 rasa_model_path = /usr/share/nlu/rasa/models/ @@ -17,7 +17,7 @@ rasa_server_port = 51054 rasa_detached_mode = 1 base_log_dir = /usr/share/nlu/logs/ store_voice_commands = 0 -online_mode = 1 +online_mode = 0 online_mode_address = 65.108.107.216 online_mode_port = 50051 online_mode_timeout = 15 diff --git a/agl_service_voiceagent/generated/voice_agent_pb2.py b/agl_service_voiceagent/generated/voice_agent_pb2.py index 4606f60..d978664 100644 --- a/agl_service_voiceagent/generated/voice_agent_pb2.py +++ b/agl_service_voiceagent/generated/voice_agent_pb2.py @@ -14,49 +14,49 @@ _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"C\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 
\n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 \x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01
\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3') +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"X\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\x12\x13\n\x0bonline_mode\x18\x04 \x01(\x08\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 
\x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 \x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3') _globals = globals() _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals) _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'voice_agent_pb2', _globals) if not 
_descriptor._USE_C_DESCRIPTORS: DESCRIPTOR._loaded_options = None - _globals['_STTFRAMEWORK']._serialized_start=993 - _globals['_STTFRAMEWORK']._serialized_end=1030 - _globals['_ONLINEMODE']._serialized_start=1032 - _globals['_ONLINEMODE']._serialized_end=1069 - _globals['_RECORDACTION']._serialized_start=1071 - _globals['_RECORDACTION']._serialized_end=1106 - _globals['_NLUMODEL']._serialized_start=1108 - _globals['_NLUMODEL']._serialized_end=1139 - _globals['_RECORDMODE']._serialized_start=1141 - _globals['_RECORDMODE']._serialized_end=1175 - _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1178 - _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1358 - _globals['_EXECUTESTATUSTYPE']._serialized_start=1361 - _globals['_EXECUTESTATUSTYPE']._serialized_end=1491 + _globals['_STTFRAMEWORK']._serialized_start=1014 + _globals['_STTFRAMEWORK']._serialized_end=1051 + _globals['_ONLINEMODE']._serialized_start=1053 + _globals['_ONLINEMODE']._serialized_end=1090 + _globals['_RECORDACTION']._serialized_start=1092 + _globals['_RECORDACTION']._serialized_end=1127 + _globals['_NLUMODEL']._serialized_start=1129 + _globals['_NLUMODEL']._serialized_end=1160 + _globals['_RECORDMODE']._serialized_start=1162 + _globals['_RECORDMODE']._serialized_end=1196 + _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1199 + _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1379 + _globals['_EXECUTESTATUSTYPE']._serialized_start=1382 + _globals['_EXECUTESTATUSTYPE']._serialized_end=1512 _globals['_EMPTY']._serialized_start=21 _globals['_EMPTY']._serialized_end=28 _globals['_SERVICESTATUS']._serialized_start=30 - _globals['_SERVICESTATUS']._serialized_end=97 - _globals['_VOICEAUDIO']._serialized_start=99 - _globals['_VOICEAUDIO']._serialized_end=193 - _globals['_WAKEWORDSTATUS']._serialized_start=195 - _globals['_WAKEWORDSTATUS']._serialized_end=227 - _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=230 - _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=377 - 
_globals['_RECOGNIZEVOICECONTROL']._serialized_start=380 - _globals['_RECOGNIZEVOICECONTROL']._serialized_end=589 - _globals['_RECOGNIZETEXTCONTROL']._serialized_start=591 - _globals['_RECOGNIZETEXTCONTROL']._serialized_end=665 - _globals['_INTENTSLOT']._serialized_start=667 - _globals['_INTENTSLOT']._serialized_end=708 - _globals['_RECOGNIZERESULT']._serialized_start=711 - _globals['_RECOGNIZERESULT']._serialized_end=853 - _globals['_EXECUTEINPUT']._serialized_start=855 - _globals['_EXECUTEINPUT']._serialized_end=920 - _globals['_EXECUTERESULT']._serialized_start=922 - _globals['_EXECUTERESULT']._serialized_end=991 - _globals['_VOICEAGENTSERVICE']._serialized_start=1494 - _globals['_VOICEAGENTSERVICE']._serialized_end=1914 + _globals['_SERVICESTATUS']._serialized_end=118 + _globals['_VOICEAUDIO']._serialized_start=120 + _globals['_VOICEAUDIO']._serialized_end=214 + _globals['_WAKEWORDSTATUS']._serialized_start=216 + _globals['_WAKEWORDSTATUS']._serialized_end=248 + _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=251 + _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=398 + _globals['_RECOGNIZEVOICECONTROL']._serialized_start=401 + _globals['_RECOGNIZEVOICECONTROL']._serialized_end=610 + _globals['_RECOGNIZETEXTCONTROL']._serialized_start=612 + _globals['_RECOGNIZETEXTCONTROL']._serialized_end=686 + _globals['_INTENTSLOT']._serialized_start=688 + _globals['_INTENTSLOT']._serialized_end=729 + _globals['_RECOGNIZERESULT']._serialized_start=732 + _globals['_RECOGNIZERESULT']._serialized_end=874 + _globals['_EXECUTEINPUT']._serialized_start=876 + _globals['_EXECUTEINPUT']._serialized_end=941 + _globals['_EXECUTERESULT']._serialized_start=943 + _globals['_EXECUTERESULT']._serialized_end=1012 + _globals['_VOICEAGENTSERVICE']._serialized_start=1515 + _globals['_VOICEAGENTSERVICE']._serialized_end=1935 # @@protoc_insertion_point(module_scope) diff --git a/agl_service_voiceagent/nlu/snips_interface.py b/agl_service_voiceagent/nlu/snips_interface.py index 
a32f574..25ad05b 100644 --- a/agl_service_voiceagent/nlu/snips_interface.py +++ b/agl_service_voiceagent/nlu/snips_interface.py @@ -46,8 +46,7 @@ class SnipsInterface: preprocessed_text = text.lower().strip() # remove special characters, punctuation, and extra whitespaces preprocessed_text = re.sub(r'[^\w\s]', '', preprocessed_text).strip() - # replace % with " precent" - preprocessed_text = re.sub(r'%', ' percent', preprocessed_text) + preprocessed_text = re.sub(r'percent', '', preprocessed_text) # replace ° with " degrees" preprocessed_text = re.sub(r'°', ' degrees ', preprocessed_text) return preprocessed_text diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto index bd2daa2..72d48c6 100644 --- a/agl_service_voiceagent/protos/voice_agent.proto +++ b/agl_service_voiceagent/protos/voice_agent.proto @@ -61,6 +61,7 @@ message ServiceStatus { string version = 1; bool status = 2; string wake_word = 3; + bool online_mode = 4; } message VoiceAudio { diff --git a/agl_service_voiceagent/servicers/voice_agent_servicer.py b/agl_service_voiceagent/servicers/voice_agent_servicer.py index 2a4de33..c149b6d 100644 --- a/agl_service_voiceagent/servicers/voice_agent_servicer.py +++ b/agl_service_voiceagent/servicers/voice_agent_servicer.py @@ -199,6 +199,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): version=self.service_version, status=True, wake_word=self.wake_word, + online_mode = self.online_mode ) # Convert the response object to a JSON string and log it @@ -280,8 +281,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): "recorder": recorder, "audio_file": audio_file } - - recorder.start_recording() + + def record(): + recorder.start_recording() + + record_thread = threading.Thread(target=record) + record_thread.start() elif request.action == voice_agent_pb2.STOP: stream_uuid = request.stream_id @@ -294,6 +299,7 @@ class 
VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): recorder = self.rvc_stream_uuids[stream_uuid]["recorder"] audio_file = self.rvc_stream_uuids[stream_uuid]["audio_file"] del self.rvc_stream_uuids[stream_uuid] + print(use_online_mode) recorder.stop_recording() @@ -316,12 +322,19 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): recognizer_uuid = self.stt_model.setup_vosk_recognizer() stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework) used_kaldi = True - print(stt) if stt not in ["FILE_NOT_FOUND", "FILE_FORMAT_INVALID", "VOICE_NOT_RECOGNIZED", ""]: if request.nlu_model == voice_agent_pb2.SNIPS: - extracted_intent = self.snips_interface.extract_intent(stt) - intent, intent_actions = self.snips_interface.process_intent(extracted_intent) + try: + extracted_intent = self.snips_interface.extract_intent(stt) + except Exception as e: + print(e) + extracted_intent = "" + if extracted_intent != "": + intent, intent_actions = self.snips_interface.process_intent(extracted_intent) + else: + intent = "" + intent_actions = {} if not intent or intent == "": status = voice_agent_pb2.INTENT_NOT_RECOGNIZED @@ -346,14 +359,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): status = voice_agent_pb2.NLU_MODEL_NOT_SUPPORTED else: - stt = "" status = voice_agent_pb2.VOICE_NOT_RECOGNIZED # cleanup the kaldi recognizer if used_kaldi: self.stt_model.cleanup_recognizer(recognizer_uuid) used_kaldi = False - # delete the audio file if not self.store_voice_command: delete_file(audio_file) @@ -516,7 +527,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): exec_response = "Uh oh, I failed to stop the media." exec_status = voice_agent_pb2.EXEC_ERROR else: - exec_response = "Sorry, I failed to execute command against intent 'MediaControl'. Maybe try again with more specific instructions." + exec_response = "Sorry, I failed to execute command." 
exec_status = voice_agent_pb2.EXEC_ERROR @@ -572,7 +583,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): if "value" in execution_item: value = execution_item["value"] if self.set_current_values(signal, value): - exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'." + exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'." exec_status = voice_agent_pb2.EXEC_SUCCESS elif "factor" in execution_item: @@ -593,7 +604,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer): value = current_value - factor value = str(value) if self.set_current_values(signal, value): - exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'." + exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'." exec_status = voice_agent_pb2.EXEC_SUCCESS else: diff --git a/agl_service_voiceagent/utils/audio_recorder.py b/agl_service_voiceagent/utils/audio_recorder.py index 49716c9..e362480 100644 --- a/agl_service_voiceagent/utils/audio_recorder.py +++ b/agl_service_voiceagent/utils/audio_recorder.py @@ -64,7 +64,7 @@ class AudioRecorder: """ print("Creating pipeline for audio recording in {} mode...".format(self.mode)) self.pipeline = Gst.Pipeline() - autoaudiosrc = Gst.ElementFactory.make("autoaudiosrc", None) + autoaudiosrc = Gst.ElementFactory.make("alsasrc", None) queue = Gst.ElementFactory.make("queue", None) queue.set_property("max-size-buffers", 0) queue.set_property("max-size-bytes", 0) @@ -109,6 +109,7 @@ class AudioRecorder: Start recording audio using the GStreamer pipeline. 
""" self.pipeline.set_state(Gst.State.PLAYING) + self.loop.run() print("Recording Voice Input...") @@ -186,3 +187,4 @@ class AudioRecorder: print("Pipeline cleanup complete!") self.bus = None self.pipeline = None + self.loop.quit() \ No newline at end of file diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py index 7e8ad8b..0a092ea 100644 --- a/agl_service_voiceagent/utils/stt_model.py +++ b/agl_service_voiceagent/utils/stt_model.py @@ -21,7 +21,7 @@ import wave from agl_service_voiceagent.utils.common import generate_unique_uuid # import the whisper model -import whisper +# import whisper # for whisper timeout feature from concurrent.futures import ThreadPoolExecutor import subprocess @@ -93,28 +93,28 @@ class STTModel: return result # Recognize speech using the whisper model - def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False): - """ - Recognize speech and return the result as a JSON object. - - Args: - filename (str): The path to the audio file. - timeout (int, optional): The timeout for recognition (default is 5 seconds). - fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported. - language (str, optional): The language code for recognition (default is None). - - Returns: - dict: A JSON object containing recognition results. - """ - def transcribe_with_whisper(): - return self.whisper_model.transcribe(filename, language = language,fp16=fp16) + # def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False): + # """ + # Recognize speech and return the result as a JSON object. + + # Args: + # filename (str): The path to the audio file. + # timeout (int, optional): The timeout for recognition (default is 5 seconds). + # fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported. 
+ # language (str, optional): The language code for recognition (default is None). + + # Returns: + # dict: A JSON object containing recognition results. + # """ + # def transcribe_with_whisper(): + # return self.whisper_model.transcribe(filename, language = language,fp16=fp16) - with ThreadPoolExecutor() as executor: - future = executor.submit(transcribe_with_whisper) - try: - return future.result(timeout=timeout) - except TimeoutError: - return {"error": "Transcription with Whisper exceeded the timeout."} + # with ThreadPoolExecutor() as executor: + # future = executor.submit(transcribe_with_whisper) + # try: + # return future.result(timeout=timeout) + # except TimeoutError: + # return {"error": "Transcription with Whisper exceeded the timeout."} def recognize_using_whisper_cpp(self,filename): command = self.whisper_cpp_path -- cgit 1.2.3-korg