Integrate Whisper AI into agl-service-voiceagent

V1: - Integrated Whisper AI for speech-to-text functionality into agl-service-voiceagent. - Add support for both online and offline mode. - Implemented a gRPC-based connection for online mode between Whisper ASR service and voice-agent service. V2: - Update kuksa-interface - Add whisper-cpp for speech-to-text functionality - Add support to control media using mpd - Fix audio recorder Bug-AGL: SPEC-5200 Change-Id: I2661ae61ba2c3283bcfde26d6e4f498270240b19 Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
author: Anuj Solanki <anuj603362@gmail.com> 2024-06-16 18:49:45 +0530
committer: Anuj Solanki <anuj603362@gmail.com> 2024-09-07 20:16:14 +0530
commit: 1144fcd343bc56f8c27ff73d3e76904010dbb832 (patch)
tree: 490915cd969f19b4eb3b3dd480554b27c1058243 /agl_service_voiceagent/protos
parent: f2b62ba4da5a178221c3210c2d468cd684e626cc (diff)
2 files changed, 35 insertions, 0 deletions
diff --git a/agl_service_voiceagent/protos/audio_processing.proto b/agl_service_voiceagent/protos/audio_processing.proto
new file mode 100644
index 0000000..edacc04
--- /dev/null
+++ b/agl_service_voiceagent/protos/audio_processing.proto
@@ -0,0 +1,23 @@
+// Proto file for the audio processing service used by the Whisper online service.
+
+syntax = "proto3";
+
+package audioproc;
+
+service AudioProcessing {
+  // Unary RPC: takes one AudioRequest (audio data) and returns one TextResponse (processed text).
+  rpc ProcessAudio (AudioRequest) returns (TextResponse);
+}
+
+// Request message for ProcessAudio; carries the audio data to be processed.
+message AudioRequest {
+  bytes audio_data = 1;  // Audio payload as raw bytes. NOTE(review): encoding/sample rate is not specified here - confirm with the server.
+}
+
+// Response message for ProcessAudio; carries the processed text.
+message TextResponse {
+  string text = 1;  // Text produced from the submitted audio.
+}
+
+// usage: 
+// python -m grpc_tools.protoc -I. --python_out=./generated/ --grpc_python_out=./generated/ audio_processing.proto
+\ No newline at end of file
diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto
index 40dfe6a..bd2daa2 100644
--- a/agl_service_voiceagent/protos/voice_agent.proto
+++ b/agl_service_voiceagent/protos/voice_agent.proto
@@ -11,6 +11,15 @@ service VoiceAgentService {
   rpc ExecuteCommand(ExecuteInput) returns (ExecuteResult);
 }
 
+enum STTFramework {  // Speech-to-text framework choice; used by the stt_framework fields of the voice-control messages.
+  VOSK = 0;  // Zero value, so this is what a reader sees when stt_framework is left unset.
+  WHISPER = 1;  // Whisper speech-to-text. NOTE(review): values are unprefixed; renaming would break generated code.
+}
+
+enum OnlineMode {  // Online/offline selection; used by RecognizeVoiceControl.online_mode.
+  ONLINE = 0;  // Zero value, so an unset online_mode reads as ONLINE. NOTE(review): confirm this default is intended.
+  OFFLINE = 1;  // Offline mode.
+}
 
 enum RecordAction {
   START = 0;
@@ -69,6 +78,7 @@ message S_RecognizeVoiceControl {
   VoiceAudio audio_stream = 1;
   NLUModel nlu_model = 2;
   string stream_id = 3;
+  STTFramework stt_framework = 4;
 }
 
 message RecognizeVoiceControl {
@@ -76,6 +86,8 @@ message RecognizeVoiceControl {
   NLUModel nlu_model = 2;
   RecordMode record_mode = 3;
   string stream_id = 4;
+  STTFramework stt_framework = 5;
+  OnlineMode online_mode = 6;
 }
 
 message RecognizeTextControl {
author	Anuj Solanki <anuj603362@gmail.com>	2024-06-16 18:49:45 +0530
committer	Anuj Solanki <anuj603362@gmail.com>	2024-09-07 20:16:14 +0530
commit	1144fcd343bc56f8c27ff73d3e76904010dbb832 (patch)
tree	490915cd969f19b4eb3b3dd480554b27c1058243 /agl_service_voiceagent/protos
parent	f2b62ba4da5a178221c3210c2d468cd684e626cc (diff)