Remove OpenAI's Whisper AI and Bug Fixing HEAD trout_19.90.0 trout/19.90.0 19.90.0 master

- Removed OpenAI's Whisper AI from agl-service-voiceagent and using whisper.cpp for speech-to-text.
- Fix audio_recorder.
- Update grpc protoc to include the online-mode status in ServiceStatus
- Set online_mode flag default to 0
- Change wake word to "hey automotive"

Bug-AGL: SPEC-5200
Change-Id: I9f1629cdcaef43498bf4cb9fdd950291a415819d
Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
author: Anuj Solanki <anuj603362@gmail.com> 2024-10-01 00:32:40 +0530
committer: Anuj Solanki <anuj603362@gmail.com> 2024-10-06 01:16:32 +0530
commit: 5a8f670c3f772cfe0345ed53e5989a6dca08a905 (patch)
tree: 7d82533a0f5356547e31609c3db1d0101de83376
parent: 1144fcd343bc56f8c27ff73d3e76904010dbb832 (diff)
8 files changed, 90 insertions, 76 deletions
diff --git a/README.md b/README.md
index d418994..7f037f7 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
 # Automotive Grade Linux (AGL) Voice Agent Service
-A gRPC-based voice agent service designed for Automotive Grade Linux (AGL). This service leverages GStreamer, Vosk, Snips, and RASA to seamlessly process user voice commands. It converts spoken words into text, extracts intents from these commands, and performs actions through the Kuksa interface.
+A gRPC-based voice agent service designed for Automotive Grade Linux (AGL). This service leverages GStreamer, Vosk, Whisper, Snips, and RASA to seamlessly process user voice commands. It converts spoken words into text, extracts intents from these commands, and performs actions through the Kuksa interface.
 
 ## Table of Contents
 - [Features](#features)
@@ -62,14 +62,15 @@ Replace `SERVER_IP` with IP address of the running Voice Agent server, and `SERV
 
 To issue a voice command, use the following command:
 ```bash
-voiceagent-service run-client --server_address SERVER_IP --server_port SERVER_PORT --action ExecuteVoiceCommand --mode manual --nlu NLU_ENGINE
+voiceagent-service run-client --server_address SERVER_IP --server_port SERVER_PORT --action ExecuteVoiceCommand --mode manual --nlu NLU_ENGINE --stt-frameword STT_FRAMEWORK
 ```
-Replace `NLU_ENGINE` with the preferred NLU engine ("snips" or "rasa"), `SERVER_IP` with IP address of the running Voice Agent server, and `SERVER_PORT` with the port of the running Voice Agent server. You can also pass a custom value to flag `--recording-time` if you want to change the default recording time from 5 seconds to any other value.
+Replace `NLU_ENGINE` with the preferred NLU engine ("snips" or "rasa"), `SERVER_IP` with IP address of the running Voice Agent server, and `SERVER_PORT` with the port of the running Voice Agent server. You can also pass a custom value to flag `--recording-time` if you want to change the default recording time from 5 seconds to any other value, you can also pass --stt-frameword to specify the STT framework to be used, supported frameworks are "vosk" and "whisper" and the default is "vosk".
 
 ## Configuration
 Configuration options for the AGL Voice Agent Service can be found in the default `config.ini` file. You can customize various settings, including the AI models, audio directories, and Kuksa integration. **Important:** while manually making changes to the config file make sure you add trailing slash to all the directory paths, ie. the paths to directories should always end with a `/`. 
 
 ## Maintainers
+- **Anuj Solanki** <anuj603362@gmail.com>
 - **Malik Talha** <talhamalik727x@gmail.com>
 
 ## License
diff --git a/agl_service_voiceagent/config.ini b/agl_service_voiceagent/config.ini
index d6d695e..e4f6313 100644
--- a/agl_service_voiceagent/config.ini
+++ b/agl_service_voiceagent/config.ini
@@ -9,7 +9,7 @@ snips_model_path = /usr/share/nlu/snips/model/
 channels = 1
 sample_rate = 16000
 bits_per_sample = 16
-wake_word = hello
+wake_word = hey automotive
 server_port = 51053
 server_address = 127.0.0.1
 rasa_model_path = /usr/share/nlu/rasa/models/
@@ -17,7 +17,7 @@ rasa_server_port = 51054
 rasa_detached_mode = 1
 base_log_dir = /usr/share/nlu/logs/
 store_voice_commands = 0
-online_mode = 1
+online_mode = 0
 online_mode_address = 65.108.107.216
 online_mode_port = 50051
 online_mode_timeout = 15
diff --git a/agl_service_voiceagent/generated/voice_agent_pb2.py b/agl_service_voiceagent/generated/voice_agent_pb2.py
index 4606f60..d978664 100644
--- a/agl_service_voiceagent/generated/voice_agent_pb2.py
+++ b/agl_service_voiceagent/generated/voice_agent_pb2.py
@@ -14,49 +14,49 @@ _sym_db = _symbol_database.Default()
 
 
 
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"C\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 
\x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"X\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\x12\x13\n\x0bonline_mode\x18\x04 \x01(\x08\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 
\x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
 
 _globals = globals()
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
 _builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'voice_agent_pb2', _globals)
 if not _descriptor._USE_C_DESCRIPTORS:
   DESCRIPTOR._loaded_options = None
-  _globals['_STTFRAMEWORK']._serialized_start=993
-  _globals['_STTFRAMEWORK']._serialized_end=1030
-  _globals['_ONLINEMODE']._serialized_start=1032
-  _globals['_ONLINEMODE']._serialized_end=1069
-  _globals['_RECORDACTION']._serialized_start=1071
-  _globals['_RECORDACTION']._serialized_end=1106
-  _globals['_NLUMODEL']._serialized_start=1108
-  _globals['_NLUMODEL']._serialized_end=1139
-  _globals['_RECORDMODE']._serialized_start=1141
-  _globals['_RECORDMODE']._serialized_end=1175
-  _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1178
-  _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1358
-  _globals['_EXECUTESTATUSTYPE']._serialized_start=1361
-  _globals['_EXECUTESTATUSTYPE']._serialized_end=1491
+  _globals['_STTFRAMEWORK']._serialized_start=1014
+  _globals['_STTFRAMEWORK']._serialized_end=1051
+  _globals['_ONLINEMODE']._serialized_start=1053
+  _globals['_ONLINEMODE']._serialized_end=1090
+  _globals['_RECORDACTION']._serialized_start=1092
+  _globals['_RECORDACTION']._serialized_end=1127
+  _globals['_NLUMODEL']._serialized_start=1129
+  _globals['_NLUMODEL']._serialized_end=1160
+  _globals['_RECORDMODE']._serialized_start=1162
+  _globals['_RECORDMODE']._serialized_end=1196
+  _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1199
+  _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1379
+  _globals['_EXECUTESTATUSTYPE']._serialized_start=1382
+  _globals['_EXECUTESTATUSTYPE']._serialized_end=1512
   _globals['_EMPTY']._serialized_start=21
   _globals['_EMPTY']._serialized_end=28
   _globals['_SERVICESTATUS']._serialized_start=30
-  _globals['_SERVICESTATUS']._serialized_end=97
-  _globals['_VOICEAUDIO']._serialized_start=99
-  _globals['_VOICEAUDIO']._serialized_end=193
-  _globals['_WAKEWORDSTATUS']._serialized_start=195
-  _globals['_WAKEWORDSTATUS']._serialized_end=227
-  _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=230
-  _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=377
-  _globals['_RECOGNIZEVOICECONTROL']._serialized_start=380
-  _globals['_RECOGNIZEVOICECONTROL']._serialized_end=589
-  _globals['_RECOGNIZETEXTCONTROL']._serialized_start=591
-  _globals['_RECOGNIZETEXTCONTROL']._serialized_end=665
-  _globals['_INTENTSLOT']._serialized_start=667
-  _globals['_INTENTSLOT']._serialized_end=708
-  _globals['_RECOGNIZERESULT']._serialized_start=711
-  _globals['_RECOGNIZERESULT']._serialized_end=853
-  _globals['_EXECUTEINPUT']._serialized_start=855
-  _globals['_EXECUTEINPUT']._serialized_end=920
-  _globals['_EXECUTERESULT']._serialized_start=922
-  _globals['_EXECUTERESULT']._serialized_end=991
-  _globals['_VOICEAGENTSERVICE']._serialized_start=1494
-  _globals['_VOICEAGENTSERVICE']._serialized_end=1914
+  _globals['_SERVICESTATUS']._serialized_end=118
+  _globals['_VOICEAUDIO']._serialized_start=120
+  _globals['_VOICEAUDIO']._serialized_end=214
+  _globals['_WAKEWORDSTATUS']._serialized_start=216
+  _globals['_WAKEWORDSTATUS']._serialized_end=248
+  _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=251
+  _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=398
+  _globals['_RECOGNIZEVOICECONTROL']._serialized_start=401
+  _globals['_RECOGNIZEVOICECONTROL']._serialized_end=610
+  _globals['_RECOGNIZETEXTCONTROL']._serialized_start=612
+  _globals['_RECOGNIZETEXTCONTROL']._serialized_end=686
+  _globals['_INTENTSLOT']._serialized_start=688
+  _globals['_INTENTSLOT']._serialized_end=729
+  _globals['_RECOGNIZERESULT']._serialized_start=732
+  _globals['_RECOGNIZERESULT']._serialized_end=874
+  _globals['_EXECUTEINPUT']._serialized_start=876
+  _globals['_EXECUTEINPUT']._serialized_end=941
+  _globals['_EXECUTERESULT']._serialized_start=943
+  _globals['_EXECUTERESULT']._serialized_end=1012
+  _globals['_VOICEAGENTSERVICE']._serialized_start=1515
+  _globals['_VOICEAGENTSERVICE']._serialized_end=1935
 # @@protoc_insertion_point(module_scope)
diff --git a/agl_service_voiceagent/nlu/snips_interface.py b/agl_service_voiceagent/nlu/snips_interface.py
index a32f574..25ad05b 100644
--- a/agl_service_voiceagent/nlu/snips_interface.py
+++ b/agl_service_voiceagent/nlu/snips_interface.py
@@ -46,8 +46,7 @@ class SnipsInterface:
         preprocessed_text = text.lower().strip()
         # remove special characters, punctuation, and extra whitespaces
         preprocessed_text = re.sub(r'[^\w\s]', '', preprocessed_text).strip()
-        # replace % with " precent"
-        preprocessed_text = re.sub(r'%', ' percent', preprocessed_text)
+        preprocessed_text = re.sub(r'percent', '', preprocessed_text)
         # replace ° with " degrees"
         preprocessed_text = re.sub(r'°', ' degrees ', preprocessed_text)
         return preprocessed_text
diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto
index bd2daa2..72d48c6 100644
--- a/agl_service_voiceagent/protos/voice_agent.proto
+++ b/agl_service_voiceagent/protos/voice_agent.proto
@@ -61,6 +61,7 @@ message ServiceStatus {
   string version = 1;
   bool status = 2;  
   string wake_word = 3;
+  bool online_mode = 4;
 }
 
 message VoiceAudio {
diff --git a/agl_service_voiceagent/servicers/voice_agent_servicer.py b/agl_service_voiceagent/servicers/voice_agent_servicer.py
index 2a4de33..c149b6d 100644
--- a/agl_service_voiceagent/servicers/voice_agent_servicer.py
+++ b/agl_service_voiceagent/servicers/voice_agent_servicer.py
@@ -199,6 +199,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
             version=self.service_version,
             status=True,
             wake_word=self.wake_word,
+            online_mode = self.online_mode
         )
 
         # Convert the response object to a JSON string and log it
@@ -280,8 +281,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                         "recorder": recorder,
                         "audio_file": audio_file
                     }
-                    
-                    recorder.start_recording()
+
+                    def record():
+                        recorder.start_recording()
+
+                    record_thread = threading.Thread(target=record)
+                    record_thread.start()
 
                 elif request.action == voice_agent_pb2.STOP:
                     stream_uuid = request.stream_id
@@ -294,6 +299,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                     recorder = self.rvc_stream_uuids[stream_uuid]["recorder"]
                     audio_file = self.rvc_stream_uuids[stream_uuid]["audio_file"]
                     del self.rvc_stream_uuids[stream_uuid]
+                    print(use_online_mode)
 
                     recorder.stop_recording()
                                       
@@ -316,12 +322,19 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                         recognizer_uuid = self.stt_model.setup_vosk_recognizer()
                         stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework)
                         used_kaldi = True
-
                     print(stt)
                     if stt not in ["FILE_NOT_FOUND", "FILE_FORMAT_INVALID", "VOICE_NOT_RECOGNIZED", ""]:
                         if request.nlu_model == voice_agent_pb2.SNIPS:
-                            extracted_intent = self.snips_interface.extract_intent(stt)
-                            intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
+                            try:
+                                extracted_intent = self.snips_interface.extract_intent(stt)
+                            except Exception as e:
+                                print(e)
+                                extracted_intent = ""
+                            if extracted_intent != "":
+                                intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
+                            else:
+                                intent = ""
+                                intent_actions = {}
                             if not intent or intent == "":
                                 status = voice_agent_pb2.INTENT_NOT_RECOGNIZED
                             
@@ -346,14 +359,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                             status = voice_agent_pb2.NLU_MODEL_NOT_SUPPORTED
 
                     else:
-                        stt = ""
                         status = voice_agent_pb2.VOICE_NOT_RECOGNIZED
                     
                     # cleanup the kaldi recognizer
                     if used_kaldi:
                         self.stt_model.cleanup_recognizer(recognizer_uuid)
                         used_kaldi = False
-
                     # delete the audio file
                     if not self.store_voice_command:   
                         delete_file(audio_file)
@@ -516,7 +527,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                     exec_response = "Uh oh, I failed to stop the media."
                     exec_status = voice_agent_pb2.EXEC_ERROR
             else:
-                exec_response = "Sorry, I failed to execute command against intent 'MediaControl'. Maybe try again with more specific instructions."
+                exec_response = "Sorry, I failed to execute command."
                 exec_status = voice_agent_pb2.EXEC_ERROR
             
 
@@ -572,7 +583,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                     if "value" in execution_item:
                         value = execution_item["value"]
                         if self.set_current_values(signal, value):
-                            exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+                            exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'."
                             exec_status = voice_agent_pb2.EXEC_SUCCESS
                     
                     elif "factor" in execution_item:
@@ -593,7 +604,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
                                     value = current_value - factor
                                     value = str(value)
                                 if self.set_current_values(signal, value):
-                                    exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+                                    exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'."
                                     exec_status = voice_agent_pb2.EXEC_SUCCESS
 
                             else:
diff --git a/agl_service_voiceagent/utils/audio_recorder.py b/agl_service_voiceagent/utils/audio_recorder.py
index 49716c9..e362480 100644
--- a/agl_service_voiceagent/utils/audio_recorder.py
+++ b/agl_service_voiceagent/utils/audio_recorder.py
@@ -64,7 +64,7 @@ class AudioRecorder:
         """
         print("Creating pipeline for audio recording in {} mode...".format(self.mode))
         self.pipeline = Gst.Pipeline()
-        autoaudiosrc = Gst.ElementFactory.make("autoaudiosrc", None)
+        autoaudiosrc = Gst.ElementFactory.make("alsasrc", None)
         queue = Gst.ElementFactory.make("queue", None)
         queue.set_property("max-size-buffers", 0)
         queue.set_property("max-size-bytes", 0)
@@ -109,6 +109,7 @@ class AudioRecorder:
         Start recording audio using the GStreamer pipeline.
         """
         self.pipeline.set_state(Gst.State.PLAYING)
+        self.loop.run()
         print("Recording Voice Input...")
 
 
@@ -186,3 +187,4 @@ class AudioRecorder:
             print("Pipeline cleanup complete!")
             self.bus = None
             self.pipeline = None
+            self.loop.quit()
+\ No newline at end of file
diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py
index 7e8ad8b..0a092ea 100644
--- a/agl_service_voiceagent/utils/stt_model.py
+++ b/agl_service_voiceagent/utils/stt_model.py
@@ -21,7 +21,7 @@ import wave
 from agl_service_voiceagent.utils.common import generate_unique_uuid
 
 # import the whisper model
-import whisper
+# import whisper
 # for whisper timeout feature
 from concurrent.futures import ThreadPoolExecutor  
 import subprocess
@@ -93,28 +93,28 @@ class STTModel:
         return result
     
     # Recognize speech using the whisper model
-    def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False):
-        """
-        Recognize speech and return the result as a JSON object.
-
-        Args:
-            filename (str): The path to the audio file.
-            timeout (int, optional): The timeout for recognition (default is 5 seconds).
-            fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported.
-            language (str, optional): The language code for recognition (default is None).
-
-        Returns:
-            dict: A JSON object containing recognition results.
-        """
-        def transcribe_with_whisper():
-            return self.whisper_model.transcribe(filename, language = language,fp16=fp16)
+    # def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False):
+    #     """
+    #     Recognize speech and return the result as a JSON object.
+
+    #     Args:
+    #         filename (str): The path to the audio file.
+    #         timeout (int, optional): The timeout for recognition (default is 5 seconds).
+    #         fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported.
+    #         language (str, optional): The language code for recognition (default is None).
+
+    #     Returns:
+    #         dict: A JSON object containing recognition results.
+    #     """
+    #     def transcribe_with_whisper():
+    #         return self.whisper_model.transcribe(filename, language = language,fp16=fp16)
         
-        with ThreadPoolExecutor() as executor:
-            future = executor.submit(transcribe_with_whisper)
-            try:
-                return future.result(timeout=timeout)
-            except TimeoutError:
-                return {"error": "Transcription with Whisper exceeded the timeout."}
+    #     with ThreadPoolExecutor() as executor:
+    #         future = executor.submit(transcribe_with_whisper)
+    #         try:
+    #             return future.result(timeout=timeout)
+    #         except TimeoutError:
+    #             return {"error": "Transcription with Whisper exceeded the timeout."}
             
     def recognize_using_whisper_cpp(self,filename):
         command = self.whisper_cpp_path
author	Anuj Solanki <anuj603362@gmail.com>	2024-10-01 00:32:40 +0530
committer	Anuj Solanki <anuj603362@gmail.com>	2024-10-06 01:16:32 +0530
commit	5a8f670c3f772cfe0345ed53e5989a6dca08a905 (patch)
tree	7d82533a0f5356547e31609c3db1d0101de83376
parent	1144fcd343bc56f8c27ff73d3e76904010dbb832 (diff)