aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnuj Solanki <anuj603362@gmail.com>2024-10-01 00:32:40 +0530
committerAnuj Solanki <anuj603362@gmail.com>2024-10-06 01:16:32 +0530
commit5a8f670c3f772cfe0345ed53e5989a6dca08a905 (patch)
tree7d82533a0f5356547e31609c3db1d0101de83376
parent1144fcd343bc56f8c27ff73d3e76904010dbb832 (diff)
Remove OpenAI's Whisper AI and Bug FixingHEADmaster
- Removed OpenAI's Whisper AI from agl-service-voiceagent and using whisper.cpp for speech-to-text. - Fix audio_recorder. - Update grpc protoc to include the online-mode status in ServiceStatus - Set online_mode flag default to 0 - Change wake word to "hey automotive" Bug-AGL: SPEC-5200 Change-Id: I9f1629cdcaef43498bf4cb9fdd950291a415819d Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
-rw-r--r--README.md7
-rw-r--r--agl_service_voiceagent/config.ini4
-rw-r--r--agl_service_voiceagent/generated/voice_agent_pb2.py72
-rw-r--r--agl_service_voiceagent/nlu/snips_interface.py3
-rw-r--r--agl_service_voiceagent/protos/voice_agent.proto1
-rw-r--r--agl_service_voiceagent/servicers/voice_agent_servicer.py31
-rw-r--r--agl_service_voiceagent/utils/audio_recorder.py4
-rw-r--r--agl_service_voiceagent/utils/stt_model.py44
8 files changed, 90 insertions, 76 deletions
diff --git a/README.md b/README.md
index d418994..7f037f7 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,5 @@
# Automotive Grade Linux (AGL) Voice Agent Service
-A gRPC-based voice agent service designed for Automotive Grade Linux (AGL). This service leverages GStreamer, Vosk, Snips, and RASA to seamlessly process user voice commands. It converts spoken words into text, extracts intents from these commands, and performs actions through the Kuksa interface.
+A gRPC-based voice agent service designed for Automotive Grade Linux (AGL). This service leverages GStreamer, Vosk, Whisper, Snips, and RASA to seamlessly process user voice commands. It converts spoken words into text, extracts intents from these commands, and performs actions through the Kuksa interface.
## Table of Contents
- [Features](#features)
@@ -62,14 +62,15 @@ Replace `SERVER_IP` with IP address of the running Voice Agent server, and `SERV
To issue a voice command, use the following command:
```bash
-voiceagent-service run-client --server_address SERVER_IP --server_port SERVER_PORT --action ExecuteVoiceCommand --mode manual --nlu NLU_ENGINE
+voiceagent-service run-client --server_address SERVER_IP --server_port SERVER_PORT --action ExecuteVoiceCommand --mode manual --nlu NLU_ENGINE --stt-framework STT_FRAMEWORK
```
-Replace `NLU_ENGINE` with the preferred NLU engine ("snips" or "rasa"), `SERVER_IP` with IP address of the running Voice Agent server, and `SERVER_PORT` with the port of the running Voice Agent server. You can also pass a custom value to flag `--recording-time` if you want to change the default recording time from 5 seconds to any other value.
+Replace `NLU_ENGINE` with the preferred NLU engine ("snips" or "rasa"), `SERVER_IP` with the IP address of the running Voice Agent server, and `SERVER_PORT` with the port of the running Voice Agent server. You can also pass a custom value to the flag `--recording-time` if you want to change the default recording time from 5 seconds to any other value. You can also pass `--stt-framework` to specify the STT framework to be used; supported frameworks are "vosk" and "whisper", and the default is "vosk".
## Configuration
Configuration options for the AGL Voice Agent Service can be found in the default `config.ini` file. You can customize various settings, including the AI models, audio directories, and Kuksa integration. **Important:** while manually making changes to the config file make sure you add trailing slash to all the directory paths, ie. the paths to directories should always end with a `/`.
## Maintainers
+- **Anuj Solanki** <anuj603362@gmail.com>
- **Malik Talha** <talhamalik727x@gmail.com>
## License
diff --git a/agl_service_voiceagent/config.ini b/agl_service_voiceagent/config.ini
index d6d695e..e4f6313 100644
--- a/agl_service_voiceagent/config.ini
+++ b/agl_service_voiceagent/config.ini
@@ -9,7 +9,7 @@ snips_model_path = /usr/share/nlu/snips/model/
channels = 1
sample_rate = 16000
bits_per_sample = 16
-wake_word = hello
+wake_word = hey automotive
server_port = 51053
server_address = 127.0.0.1
rasa_model_path = /usr/share/nlu/rasa/models/
@@ -17,7 +17,7 @@ rasa_server_port = 51054
rasa_detached_mode = 1
base_log_dir = /usr/share/nlu/logs/
store_voice_commands = 0
-online_mode = 1
+online_mode = 0
online_mode_address = 65.108.107.216
online_mode_port = 50051
online_mode_timeout = 15
diff --git a/agl_service_voiceagent/generated/voice_agent_pb2.py b/agl_service_voiceagent/generated/voice_agent_pb2.py
index 4606f60..d978664 100644
--- a/agl_service_voiceagent/generated/voice_agent_pb2.py
+++ b/agl_service_voiceagent/generated/voice_agent_pb2.py
@@ -14,49 +14,49 @@ _sym_db = _symbol_database.Default()
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"C\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 
\x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"X\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\x12\x13\n\x0bonline_mode\x18\x04 \x01(\x08\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 
\x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'voice_agent_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
DESCRIPTOR._loaded_options = None
- _globals['_STTFRAMEWORK']._serialized_start=993
- _globals['_STTFRAMEWORK']._serialized_end=1030
- _globals['_ONLINEMODE']._serialized_start=1032
- _globals['_ONLINEMODE']._serialized_end=1069
- _globals['_RECORDACTION']._serialized_start=1071
- _globals['_RECORDACTION']._serialized_end=1106
- _globals['_NLUMODEL']._serialized_start=1108
- _globals['_NLUMODEL']._serialized_end=1139
- _globals['_RECORDMODE']._serialized_start=1141
- _globals['_RECORDMODE']._serialized_end=1175
- _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1178
- _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1358
- _globals['_EXECUTESTATUSTYPE']._serialized_start=1361
- _globals['_EXECUTESTATUSTYPE']._serialized_end=1491
+ _globals['_STTFRAMEWORK']._serialized_start=1014
+ _globals['_STTFRAMEWORK']._serialized_end=1051
+ _globals['_ONLINEMODE']._serialized_start=1053
+ _globals['_ONLINEMODE']._serialized_end=1090
+ _globals['_RECORDACTION']._serialized_start=1092
+ _globals['_RECORDACTION']._serialized_end=1127
+ _globals['_NLUMODEL']._serialized_start=1129
+ _globals['_NLUMODEL']._serialized_end=1160
+ _globals['_RECORDMODE']._serialized_start=1162
+ _globals['_RECORDMODE']._serialized_end=1196
+ _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1199
+ _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1379
+ _globals['_EXECUTESTATUSTYPE']._serialized_start=1382
+ _globals['_EXECUTESTATUSTYPE']._serialized_end=1512
_globals['_EMPTY']._serialized_start=21
_globals['_EMPTY']._serialized_end=28
_globals['_SERVICESTATUS']._serialized_start=30
- _globals['_SERVICESTATUS']._serialized_end=97
- _globals['_VOICEAUDIO']._serialized_start=99
- _globals['_VOICEAUDIO']._serialized_end=193
- _globals['_WAKEWORDSTATUS']._serialized_start=195
- _globals['_WAKEWORDSTATUS']._serialized_end=227
- _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=230
- _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=377
- _globals['_RECOGNIZEVOICECONTROL']._serialized_start=380
- _globals['_RECOGNIZEVOICECONTROL']._serialized_end=589
- _globals['_RECOGNIZETEXTCONTROL']._serialized_start=591
- _globals['_RECOGNIZETEXTCONTROL']._serialized_end=665
- _globals['_INTENTSLOT']._serialized_start=667
- _globals['_INTENTSLOT']._serialized_end=708
- _globals['_RECOGNIZERESULT']._serialized_start=711
- _globals['_RECOGNIZERESULT']._serialized_end=853
- _globals['_EXECUTEINPUT']._serialized_start=855
- _globals['_EXECUTEINPUT']._serialized_end=920
- _globals['_EXECUTERESULT']._serialized_start=922
- _globals['_EXECUTERESULT']._serialized_end=991
- _globals['_VOICEAGENTSERVICE']._serialized_start=1494
- _globals['_VOICEAGENTSERVICE']._serialized_end=1914
+ _globals['_SERVICESTATUS']._serialized_end=118
+ _globals['_VOICEAUDIO']._serialized_start=120
+ _globals['_VOICEAUDIO']._serialized_end=214
+ _globals['_WAKEWORDSTATUS']._serialized_start=216
+ _globals['_WAKEWORDSTATUS']._serialized_end=248
+ _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=251
+ _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=398
+ _globals['_RECOGNIZEVOICECONTROL']._serialized_start=401
+ _globals['_RECOGNIZEVOICECONTROL']._serialized_end=610
+ _globals['_RECOGNIZETEXTCONTROL']._serialized_start=612
+ _globals['_RECOGNIZETEXTCONTROL']._serialized_end=686
+ _globals['_INTENTSLOT']._serialized_start=688
+ _globals['_INTENTSLOT']._serialized_end=729
+ _globals['_RECOGNIZERESULT']._serialized_start=732
+ _globals['_RECOGNIZERESULT']._serialized_end=874
+ _globals['_EXECUTEINPUT']._serialized_start=876
+ _globals['_EXECUTEINPUT']._serialized_end=941
+ _globals['_EXECUTERESULT']._serialized_start=943
+ _globals['_EXECUTERESULT']._serialized_end=1012
+ _globals['_VOICEAGENTSERVICE']._serialized_start=1515
+ _globals['_VOICEAGENTSERVICE']._serialized_end=1935
# @@protoc_insertion_point(module_scope)
diff --git a/agl_service_voiceagent/nlu/snips_interface.py b/agl_service_voiceagent/nlu/snips_interface.py
index a32f574..25ad05b 100644
--- a/agl_service_voiceagent/nlu/snips_interface.py
+++ b/agl_service_voiceagent/nlu/snips_interface.py
@@ -46,8 +46,7 @@ class SnipsInterface:
preprocessed_text = text.lower().strip()
# remove special characters, punctuation, and extra whitespaces
preprocessed_text = re.sub(r'[^\w\s]', '', preprocessed_text).strip()
- # replace % with " precent"
- preprocessed_text = re.sub(r'%', ' percent', preprocessed_text)
+ preprocessed_text = re.sub(r'percent', '', preprocessed_text)
# replace ° with " degrees"
preprocessed_text = re.sub(r'°', ' degrees ', preprocessed_text)
return preprocessed_text
diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto
index bd2daa2..72d48c6 100644
--- a/agl_service_voiceagent/protos/voice_agent.proto
+++ b/agl_service_voiceagent/protos/voice_agent.proto
@@ -61,6 +61,7 @@ message ServiceStatus {
string version = 1;
bool status = 2;
string wake_word = 3;
+ bool online_mode = 4;
}
message VoiceAudio {
diff --git a/agl_service_voiceagent/servicers/voice_agent_servicer.py b/agl_service_voiceagent/servicers/voice_agent_servicer.py
index 2a4de33..c149b6d 100644
--- a/agl_service_voiceagent/servicers/voice_agent_servicer.py
+++ b/agl_service_voiceagent/servicers/voice_agent_servicer.py
@@ -199,6 +199,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
version=self.service_version,
status=True,
wake_word=self.wake_word,
+ online_mode = self.online_mode
)
# Convert the response object to a JSON string and log it
@@ -280,8 +281,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
"recorder": recorder,
"audio_file": audio_file
}
-
- recorder.start_recording()
+
+ def record():
+ recorder.start_recording()
+
+ record_thread = threading.Thread(target=record)
+ record_thread.start()
elif request.action == voice_agent_pb2.STOP:
stream_uuid = request.stream_id
@@ -294,6 +299,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
recorder = self.rvc_stream_uuids[stream_uuid]["recorder"]
audio_file = self.rvc_stream_uuids[stream_uuid]["audio_file"]
del self.rvc_stream_uuids[stream_uuid]
+ print(use_online_mode)
recorder.stop_recording()
@@ -316,12 +322,19 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
recognizer_uuid = self.stt_model.setup_vosk_recognizer()
stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework)
used_kaldi = True
-
print(stt)
if stt not in ["FILE_NOT_FOUND", "FILE_FORMAT_INVALID", "VOICE_NOT_RECOGNIZED", ""]:
if request.nlu_model == voice_agent_pb2.SNIPS:
- extracted_intent = self.snips_interface.extract_intent(stt)
- intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
+ try:
+ extracted_intent = self.snips_interface.extract_intent(stt)
+ except Exception as e:
+ print(e)
+ extracted_intent = ""
+ if extracted_intent != "":
+ intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
+ else:
+ intent = ""
+ intent_actions = {}
if not intent or intent == "":
status = voice_agent_pb2.INTENT_NOT_RECOGNIZED
@@ -346,14 +359,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
status = voice_agent_pb2.NLU_MODEL_NOT_SUPPORTED
else:
- stt = ""
status = voice_agent_pb2.VOICE_NOT_RECOGNIZED
# cleanup the kaldi recognizer
if used_kaldi:
self.stt_model.cleanup_recognizer(recognizer_uuid)
used_kaldi = False
-
# delete the audio file
if not self.store_voice_command:
delete_file(audio_file)
@@ -516,7 +527,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
exec_response = "Uh oh, I failed to stop the media."
exec_status = voice_agent_pb2.EXEC_ERROR
else:
- exec_response = "Sorry, I failed to execute command against intent 'MediaControl'. Maybe try again with more specific instructions."
+ exec_response = "Sorry, I failed to execute command."
exec_status = voice_agent_pb2.EXEC_ERROR
@@ -572,7 +583,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
if "value" in execution_item:
value = execution_item["value"]
if self.set_current_values(signal, value):
- exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+ exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'."
exec_status = voice_agent_pb2.EXEC_SUCCESS
elif "factor" in execution_item:
@@ -593,7 +604,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
value = current_value - factor
value = str(value)
if self.set_current_values(signal, value):
- exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+ exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'."
exec_status = voice_agent_pb2.EXEC_SUCCESS
else:
diff --git a/agl_service_voiceagent/utils/audio_recorder.py b/agl_service_voiceagent/utils/audio_recorder.py
index 49716c9..e362480 100644
--- a/agl_service_voiceagent/utils/audio_recorder.py
+++ b/agl_service_voiceagent/utils/audio_recorder.py
@@ -64,7 +64,7 @@ class AudioRecorder:
"""
print("Creating pipeline for audio recording in {} mode...".format(self.mode))
self.pipeline = Gst.Pipeline()
- autoaudiosrc = Gst.ElementFactory.make("autoaudiosrc", None)
+ autoaudiosrc = Gst.ElementFactory.make("alsasrc", None)
queue = Gst.ElementFactory.make("queue", None)
queue.set_property("max-size-buffers", 0)
queue.set_property("max-size-bytes", 0)
@@ -109,6 +109,7 @@ class AudioRecorder:
Start recording audio using the GStreamer pipeline.
"""
self.pipeline.set_state(Gst.State.PLAYING)
+ self.loop.run()
print("Recording Voice Input...")
@@ -186,3 +187,4 @@ class AudioRecorder:
print("Pipeline cleanup complete!")
self.bus = None
self.pipeline = None
+ self.loop.quit() \ No newline at end of file
diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py
index 7e8ad8b..0a092ea 100644
--- a/agl_service_voiceagent/utils/stt_model.py
+++ b/agl_service_voiceagent/utils/stt_model.py
@@ -21,7 +21,7 @@ import wave
from agl_service_voiceagent.utils.common import generate_unique_uuid
# import the whisper model
-import whisper
+# import whisper
# for whisper timeout feature
from concurrent.futures import ThreadPoolExecutor
import subprocess
@@ -93,28 +93,28 @@ class STTModel:
return result
# Recognize speech using the whisper model
- def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False):
- """
- Recognize speech and return the result as a JSON object.
-
- Args:
- filename (str): The path to the audio file.
- timeout (int, optional): The timeout for recognition (default is 5 seconds).
- fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported.
- language (str, optional): The language code for recognition (default is None).
-
- Returns:
- dict: A JSON object containing recognition results.
- """
- def transcribe_with_whisper():
- return self.whisper_model.transcribe(filename, language = language,fp16=fp16)
+ # def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False):
+ # """
+ # Recognize speech and return the result as a JSON object.
+
+ # Args:
+ # filename (str): The path to the audio file.
+ # timeout (int, optional): The timeout for recognition (default is 5 seconds).
+ # fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported.
+ # language (str, optional): The language code for recognition (default is None).
+
+ # Returns:
+ # dict: A JSON object containing recognition results.
+ # """
+ # def transcribe_with_whisper():
+ # return self.whisper_model.transcribe(filename, language = language,fp16=fp16)
- with ThreadPoolExecutor() as executor:
- future = executor.submit(transcribe_with_whisper)
- try:
- return future.result(timeout=timeout)
- except TimeoutError:
- return {"error": "Transcription with Whisper exceeded the timeout."}
+ # with ThreadPoolExecutor() as executor:
+ # future = executor.submit(transcribe_with_whisper)
+ # try:
+ # return future.result(timeout=timeout)
+ # except TimeoutError:
+ # return {"error": "Transcription with Whisper exceeded the timeout."}
def recognize_using_whisper_cpp(self,filename):
command = self.whisper_cpp_path