aboutsummaryrefslogtreecommitdiffstats
path: root/agl_service_voiceagent
diff options
context:
space:
mode:
author: Anuj Solanki <anuj603362@gmail.com> 2024-10-01 00:32:40 +0530
committer: Anuj Solanki <anuj603362@gmail.com> 2024-10-06 01:16:32 +0530
commit: 5a8f670c3f772cfe0345ed53e5989a6dca08a905 (patch)
tree: 7d82533a0f5356547e31609c3db1d0101de83376 /agl_service_voiceagent
parent: 1144fcd343bc56f8c27ff73d3e76904010dbb832 (diff)
Remove OpenAI's Whisper AI and Bug Fixing [HEAD, master]
- Removed OpenAI's Whisper AI from agl-service-voiceagent and using whisper.cpp for speech-to-text.
- Fix audio_recorder.
- Update grpc protoc to include the online-mode status in ServiceStatus
- Set online_mode flag default to 0
- Change wake word to "hey automotive"

Bug-AGL: SPEC-5200
Change-Id: I9f1629cdcaef43498bf4cb9fdd950291a415819d
Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
Diffstat (limited to 'agl_service_voiceagent')
-rw-r--r-- agl_service_voiceagent/config.ini 4
-rw-r--r-- agl_service_voiceagent/generated/voice_agent_pb2.py 72
-rw-r--r-- agl_service_voiceagent/nlu/snips_interface.py 3
-rw-r--r-- agl_service_voiceagent/protos/voice_agent.proto 1
-rw-r--r-- agl_service_voiceagent/servicers/voice_agent_servicer.py 31
-rw-r--r-- agl_service_voiceagent/utils/audio_recorder.py 4
-rw-r--r-- agl_service_voiceagent/utils/stt_model.py 44
7 files changed, 86 insertions, 73 deletions
diff --git a/agl_service_voiceagent/config.ini b/agl_service_voiceagent/config.ini
index d6d695e..e4f6313 100644
--- a/agl_service_voiceagent/config.ini
+++ b/agl_service_voiceagent/config.ini
@@ -9,7 +9,7 @@ snips_model_path = /usr/share/nlu/snips/model/
channels = 1
sample_rate = 16000
bits_per_sample = 16
-wake_word = hello
+wake_word = hey automotive
server_port = 51053
server_address = 127.0.0.1
rasa_model_path = /usr/share/nlu/rasa/models/
@@ -17,7 +17,7 @@ rasa_server_port = 51054
rasa_detached_mode = 1
base_log_dir = /usr/share/nlu/logs/
store_voice_commands = 0
-online_mode = 1
+online_mode = 0
online_mode_address = 65.108.107.216
online_mode_port = 50051
online_mode_timeout = 15
diff --git a/agl_service_voiceagent/generated/voice_agent_pb2.py b/agl_service_voiceagent/generated/voice_agent_pb2.py
index 4606f60..d978664 100644
--- a/agl_service_voiceagent/generated/voice_agent_pb2.py
+++ b/agl_service_voiceagent/generated/voice_agent_pb2.py
@@ -14,49 +14,49 @@ _sym_db = _symbol_database.Default()
-DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"C\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 
\x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"X\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\x12\x13\n\x0bonline_mode\x18\x04 \x01(\x08\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 
\x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'voice_agent_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
DESCRIPTOR._loaded_options = None
- _globals['_STTFRAMEWORK']._serialized_start=993
- _globals['_STTFRAMEWORK']._serialized_end=1030
- _globals['_ONLINEMODE']._serialized_start=1032
- _globals['_ONLINEMODE']._serialized_end=1069
- _globals['_RECORDACTION']._serialized_start=1071
- _globals['_RECORDACTION']._serialized_end=1106
- _globals['_NLUMODEL']._serialized_start=1108
- _globals['_NLUMODEL']._serialized_end=1139
- _globals['_RECORDMODE']._serialized_start=1141
- _globals['_RECORDMODE']._serialized_end=1175
- _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1178
- _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1358
- _globals['_EXECUTESTATUSTYPE']._serialized_start=1361
- _globals['_EXECUTESTATUSTYPE']._serialized_end=1491
+ _globals['_STTFRAMEWORK']._serialized_start=1014
+ _globals['_STTFRAMEWORK']._serialized_end=1051
+ _globals['_ONLINEMODE']._serialized_start=1053
+ _globals['_ONLINEMODE']._serialized_end=1090
+ _globals['_RECORDACTION']._serialized_start=1092
+ _globals['_RECORDACTION']._serialized_end=1127
+ _globals['_NLUMODEL']._serialized_start=1129
+ _globals['_NLUMODEL']._serialized_end=1160
+ _globals['_RECORDMODE']._serialized_start=1162
+ _globals['_RECORDMODE']._serialized_end=1196
+ _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1199
+ _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1379
+ _globals['_EXECUTESTATUSTYPE']._serialized_start=1382
+ _globals['_EXECUTESTATUSTYPE']._serialized_end=1512
_globals['_EMPTY']._serialized_start=21
_globals['_EMPTY']._serialized_end=28
_globals['_SERVICESTATUS']._serialized_start=30
- _globals['_SERVICESTATUS']._serialized_end=97
- _globals['_VOICEAUDIO']._serialized_start=99
- _globals['_VOICEAUDIO']._serialized_end=193
- _globals['_WAKEWORDSTATUS']._serialized_start=195
- _globals['_WAKEWORDSTATUS']._serialized_end=227
- _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=230
- _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=377
- _globals['_RECOGNIZEVOICECONTROL']._serialized_start=380
- _globals['_RECOGNIZEVOICECONTROL']._serialized_end=589
- _globals['_RECOGNIZETEXTCONTROL']._serialized_start=591
- _globals['_RECOGNIZETEXTCONTROL']._serialized_end=665
- _globals['_INTENTSLOT']._serialized_start=667
- _globals['_INTENTSLOT']._serialized_end=708
- _globals['_RECOGNIZERESULT']._serialized_start=711
- _globals['_RECOGNIZERESULT']._serialized_end=853
- _globals['_EXECUTEINPUT']._serialized_start=855
- _globals['_EXECUTEINPUT']._serialized_end=920
- _globals['_EXECUTERESULT']._serialized_start=922
- _globals['_EXECUTERESULT']._serialized_end=991
- _globals['_VOICEAGENTSERVICE']._serialized_start=1494
- _globals['_VOICEAGENTSERVICE']._serialized_end=1914
+ _globals['_SERVICESTATUS']._serialized_end=118
+ _globals['_VOICEAUDIO']._serialized_start=120
+ _globals['_VOICEAUDIO']._serialized_end=214
+ _globals['_WAKEWORDSTATUS']._serialized_start=216
+ _globals['_WAKEWORDSTATUS']._serialized_end=248
+ _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=251
+ _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=398
+ _globals['_RECOGNIZEVOICECONTROL']._serialized_start=401
+ _globals['_RECOGNIZEVOICECONTROL']._serialized_end=610
+ _globals['_RECOGNIZETEXTCONTROL']._serialized_start=612
+ _globals['_RECOGNIZETEXTCONTROL']._serialized_end=686
+ _globals['_INTENTSLOT']._serialized_start=688
+ _globals['_INTENTSLOT']._serialized_end=729
+ _globals['_RECOGNIZERESULT']._serialized_start=732
+ _globals['_RECOGNIZERESULT']._serialized_end=874
+ _globals['_EXECUTEINPUT']._serialized_start=876
+ _globals['_EXECUTEINPUT']._serialized_end=941
+ _globals['_EXECUTERESULT']._serialized_start=943
+ _globals['_EXECUTERESULT']._serialized_end=1012
+ _globals['_VOICEAGENTSERVICE']._serialized_start=1515
+ _globals['_VOICEAGENTSERVICE']._serialized_end=1935
# @@protoc_insertion_point(module_scope)
diff --git a/agl_service_voiceagent/nlu/snips_interface.py b/agl_service_voiceagent/nlu/snips_interface.py
index a32f574..25ad05b 100644
--- a/agl_service_voiceagent/nlu/snips_interface.py
+++ b/agl_service_voiceagent/nlu/snips_interface.py
@@ -46,8 +46,7 @@ class SnipsInterface:
preprocessed_text = text.lower().strip()
# remove special characters, punctuation, and extra whitespaces
preprocessed_text = re.sub(r'[^\w\s]', '', preprocessed_text).strip()
- # replace % with " precent"
- preprocessed_text = re.sub(r'%', ' percent', preprocessed_text)
+ preprocessed_text = re.sub(r'percent', '', preprocessed_text)
# replace ° with " degrees"
preprocessed_text = re.sub(r'°', ' degrees ', preprocessed_text)
return preprocessed_text
diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto
index bd2daa2..72d48c6 100644
--- a/agl_service_voiceagent/protos/voice_agent.proto
+++ b/agl_service_voiceagent/protos/voice_agent.proto
@@ -61,6 +61,7 @@ message ServiceStatus {
string version = 1;
bool status = 2;
string wake_word = 3;
+ bool online_mode = 4;
}
message VoiceAudio {
diff --git a/agl_service_voiceagent/servicers/voice_agent_servicer.py b/agl_service_voiceagent/servicers/voice_agent_servicer.py
index 2a4de33..c149b6d 100644
--- a/agl_service_voiceagent/servicers/voice_agent_servicer.py
+++ b/agl_service_voiceagent/servicers/voice_agent_servicer.py
@@ -199,6 +199,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
version=self.service_version,
status=True,
wake_word=self.wake_word,
+ online_mode = self.online_mode
)
# Convert the response object to a JSON string and log it
@@ -280,8 +281,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
"recorder": recorder,
"audio_file": audio_file
}
-
- recorder.start_recording()
+
+ def record():
+ recorder.start_recording()
+
+ record_thread = threading.Thread(target=record)
+ record_thread.start()
elif request.action == voice_agent_pb2.STOP:
stream_uuid = request.stream_id
@@ -294,6 +299,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
recorder = self.rvc_stream_uuids[stream_uuid]["recorder"]
audio_file = self.rvc_stream_uuids[stream_uuid]["audio_file"]
del self.rvc_stream_uuids[stream_uuid]
+ print(use_online_mode)
recorder.stop_recording()
@@ -316,12 +322,19 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
recognizer_uuid = self.stt_model.setup_vosk_recognizer()
stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework)
used_kaldi = True
-
print(stt)
if stt not in ["FILE_NOT_FOUND", "FILE_FORMAT_INVALID", "VOICE_NOT_RECOGNIZED", ""]:
if request.nlu_model == voice_agent_pb2.SNIPS:
- extracted_intent = self.snips_interface.extract_intent(stt)
- intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
+ try:
+ extracted_intent = self.snips_interface.extract_intent(stt)
+ except Exception as e:
+ print(e)
+ extracted_intent = ""
+ if extracted_intent != "":
+ intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
+ else:
+ intent = ""
+ intent_actions = {}
if not intent or intent == "":
status = voice_agent_pb2.INTENT_NOT_RECOGNIZED
@@ -346,14 +359,12 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
status = voice_agent_pb2.NLU_MODEL_NOT_SUPPORTED
else:
- stt = ""
status = voice_agent_pb2.VOICE_NOT_RECOGNIZED
# cleanup the kaldi recognizer
if used_kaldi:
self.stt_model.cleanup_recognizer(recognizer_uuid)
used_kaldi = False
-
# delete the audio file
if not self.store_voice_command:
delete_file(audio_file)
@@ -516,7 +527,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
exec_response = "Uh oh, I failed to stop the media."
exec_status = voice_agent_pb2.EXEC_ERROR
else:
- exec_response = "Sorry, I failed to execute command against intent 'MediaControl'. Maybe try again with more specific instructions."
+ exec_response = "Sorry, I failed to execute command."
exec_status = voice_agent_pb2.EXEC_ERROR
@@ -572,7 +583,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
if "value" in execution_item:
value = execution_item["value"]
if self.set_current_values(signal, value):
- exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+ exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'."
exec_status = voice_agent_pb2.EXEC_SUCCESS
elif "factor" in execution_item:
@@ -593,7 +604,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
value = current_value - factor
value = str(value)
if self.set_current_values(signal, value):
- exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+ exec_response = f"Yay, I successfully updated '{intent}' to value '{value}'."
exec_status = voice_agent_pb2.EXEC_SUCCESS
else:
diff --git a/agl_service_voiceagent/utils/audio_recorder.py b/agl_service_voiceagent/utils/audio_recorder.py
index 49716c9..e362480 100644
--- a/agl_service_voiceagent/utils/audio_recorder.py
+++ b/agl_service_voiceagent/utils/audio_recorder.py
@@ -64,7 +64,7 @@ class AudioRecorder:
"""
print("Creating pipeline for audio recording in {} mode...".format(self.mode))
self.pipeline = Gst.Pipeline()
- autoaudiosrc = Gst.ElementFactory.make("autoaudiosrc", None)
+ autoaudiosrc = Gst.ElementFactory.make("alsasrc", None)
queue = Gst.ElementFactory.make("queue", None)
queue.set_property("max-size-buffers", 0)
queue.set_property("max-size-bytes", 0)
@@ -109,6 +109,7 @@ class AudioRecorder:
Start recording audio using the GStreamer pipeline.
"""
self.pipeline.set_state(Gst.State.PLAYING)
+ self.loop.run()
print("Recording Voice Input...")
@@ -186,3 +187,4 @@ class AudioRecorder:
print("Pipeline cleanup complete!")
self.bus = None
self.pipeline = None
+ self.loop.quit()
\ No newline at end of file
diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py
index 7e8ad8b..0a092ea 100644
--- a/agl_service_voiceagent/utils/stt_model.py
+++ b/agl_service_voiceagent/utils/stt_model.py
@@ -21,7 +21,7 @@ import wave
from agl_service_voiceagent.utils.common import generate_unique_uuid
# import the whisper model
-import whisper
+# import whisper
# for whisper timeout feature
from concurrent.futures import ThreadPoolExecutor
import subprocess
@@ -93,28 +93,28 @@ class STTModel:
return result
# Recognize speech using the whisper model
- def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False):
- """
- Recognize speech and return the result as a JSON object.
-
- Args:
- filename (str): The path to the audio file.
- timeout (int, optional): The timeout for recognition (default is 5 seconds).
- fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported.
- language (str, optional): The language code for recognition (default is None).
-
- Returns:
- dict: A JSON object containing recognition results.
- """
- def transcribe_with_whisper():
- return self.whisper_model.transcribe(filename, language = language,fp16=fp16)
+ # def recognize_using_whisper(self,filename,language = None,timeout = 5,fp16=False):
+ # """
+ # Recognize speech and return the result as a JSON object.
+
+ # Args:
+ # filename (str): The path to the audio file.
+ # timeout (int, optional): The timeout for recognition (default is 5 seconds).
+ # fp16 (bool, optional): If True, use 16-bit floating point precision, (default is False) because cuda is not supported.
+ # language (str, optional): The language code for recognition (default is None).
+
+ # Returns:
+ # dict: A JSON object containing recognition results.
+ # """
+ # def transcribe_with_whisper():
+ # return self.whisper_model.transcribe(filename, language = language,fp16=fp16)
- with ThreadPoolExecutor() as executor:
- future = executor.submit(transcribe_with_whisper)
- try:
- return future.result(timeout=timeout)
- except TimeoutError:
- return {"error": "Transcription with Whisper exceeded the timeout."}
+ # with ThreadPoolExecutor() as executor:
+ # future = executor.submit(transcribe_with_whisper)
+ # try:
+ # return future.result(timeout=timeout)
+ # except TimeoutError:
+ # return {"error": "Transcription with Whisper exceeded the timeout."}
def recognize_using_whisper_cpp(self,filename):
command = self.whisper_cpp_path