author     Anuj Solanki <anuj603362@gmail.com>  2024-06-16 18:49:45 +0530
committer  Anuj Solanki <anuj603362@gmail.com>  2024-09-07 20:16:14 +0530
commit     1144fcd343bc56f8c27ff73d3e76904010dbb832
tree       490915cd969f19b4eb3b3dd480554b27c1058243
parent     f2b62ba4da5a178221c3210c2d468cd684e626cc
Integrate Whisper AI into agl-service-voiceagent
V1:
- Integrated Whisper AI for speech-to-text functionality into agl-service-voiceagent.
- Add support for both online and offline mode.
- Implemented a gRPC-based connection for online mode between the Whisper ASR service and the voice-agent service.

V2:
- Update kuksa-interface.
- Add whisper-cpp for speech-to-text functionality.
- Add support to control media using mpd.
- Fix audio recorder.

Bug-AGL: SPEC-5200
Change-Id: I2661ae61ba2c3283bcfde26d6e4f498270240b19
Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
-rw-r--r--  .gitignore                                                     |    3
-rw-r--r--  agl_service_voiceagent/client.py                               |   67
-rw-r--r--  agl_service_voiceagent/config.ini                              |   26
-rw-r--r--  agl_service_voiceagent/generated/audio_processing_pb2.py      |   30
-rw-r--r--  agl_service_voiceagent/generated/audio_processing_pb2_grpc.py |  106
-rw-r--r--  agl_service_voiceagent/generated/voice_agent_pb2.py           |   62
-rw-r--r--  agl_service_voiceagent/generated/voice_agent_pb2_grpc.py      |  362
-rw-r--r--  agl_service_voiceagent/nlu/snips_interface.py                 |    5
-rw-r--r--  agl_service_voiceagent/protos/audio_processing.proto          |   23
-rw-r--r--  agl_service_voiceagent/protos/voice_agent.proto               |   12
-rw-r--r--  agl_service_voiceagent/server.py                              |    5
-rw-r--r--  agl_service_voiceagent/service.py                             |   74
-rw-r--r--  agl_service_voiceagent/servicers/voice_agent_servicer.py      |  281
-rw-r--r--  agl_service_voiceagent/utils/audio_recorder.py                |    3
-rw-r--r--  agl_service_voiceagent/utils/media_controller.py              |  132
-rw-r--r--  agl_service_voiceagent/utils/stt_model.py                     |   91
-rw-r--r--  agl_service_voiceagent/utils/stt_online_service.py            |   88
-rw-r--r--  agl_service_voiceagent/utils/vss_interface.py                 |  236
-rw-r--r--  agl_service_voiceagent/utils/wake_word.py                     |    4
-rw-r--r--  mappings/intents_vss_map.json                                  |   16
-rw-r--r--  mappings/vss_signals_spec.json                                 |   16
-rw-r--r--  setup.py                                                       |   11
22 files changed, 1560 insertions(+), 93 deletions(-)
diff --git a/.gitignore b/.gitignore
index 2035a32..8b0ce61 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
+.DS_Store
# PyInstaller
# Usually these files are written by a python script from a template
@@ -171,4 +172,4 @@ cython_debug/
# logs dir
logs/
-generated/
+
diff --git a/agl_service_voiceagent/client.py b/agl_service_voiceagent/client.py
index 88ef785..ee7bc52 100644
--- a/agl_service_voiceagent/client.py
+++ b/agl_service_voiceagent/client.py
@@ -14,16 +14,19 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import sys
+sys.path.append("../")
import time
import grpc
from agl_service_voiceagent.generated import voice_agent_pb2
from agl_service_voiceagent.generated import voice_agent_pb2_grpc
-def run_client(server_address, server_port, action, mode, nlu_engine, recording_time):
+def run_client(server_address, server_port, action, mode, nlu_engine, recording_time,stt_framework,online_mode):
SERVER_URL = server_address + ":" + server_port
nlu_engine = voice_agent_pb2.RASA if nlu_engine == "rasa" else voice_agent_pb2.SNIPS
print("Starting Voice Agent Client...")
print(f"Client connecting to URL: {SERVER_URL}")
+ print("STT Framework:", stt_framework)
with grpc.insecure_channel(SERVER_URL) as channel:
print("Press Ctrl+C to stop the client.")
print("Voice Agent Client started!")
@@ -51,18 +54,74 @@ def run_client(server_address, server_port, action, mode, nlu_engine, recording_
elif action == 'ExecuteVoiceCommand':
if mode == 'auto':
- raise ValueError("[-] Auto mode is not implemented yet.")
+ # raise ValueError("[-] Auto mode is not implemented yet.")
+ stub = voice_agent_pb2_grpc.VoiceAgentServiceStub(channel=channel)
+ stt_framework = voice_agent_pb2.VOSK if stt_framework == "vosk" else voice_agent_pb2.WHISPER
+ online_mode = voice_agent_pb2.ONLINE if online_mode == True else voice_agent_pb2.OFFLINE
+ while(True):
+ wake_request = voice_agent_pb2.Empty()
+ wake_results = stub.DetectWakeWord(wake_request)
+ wake_word_detected = False
+ for wake_result in wake_results:
+ print("Wake word status: ", wake_word_detected)
+ if wake_result.status:
+ print("Wake word status: ", wake_result.status)
+ wake_word_detected = True
+ break
+ print("Wake word detected: ", wake_word_detected)
+ if wake_word_detected:
+ print("[+] Wake Word detected! Recording voice command...")
+ record_start_request = voice_agent_pb2.RecognizeVoiceControl(action=voice_agent_pb2.START, nlu_model=nlu_engine, record_mode=voice_agent_pb2.MANUAL, stt_framework=stt_framework,online_mode=online_mode,)
+ response = stub.RecognizeVoiceCommand(iter([record_start_request]))
+ stream_id = response.stream_id
+
+ time.sleep(recording_time) # pause here for the number of seconds passed by user or default 5 seconds
+
+ record_stop_request = voice_agent_pb2.RecognizeVoiceControl(action=voice_agent_pb2.STOP, nlu_model=nlu_engine, record_mode=voice_agent_pb2.MANUAL, stream_id=stream_id,stt_framework=stt_framework,online_mode=online_mode,)
+ record_result = stub.RecognizeVoiceCommand(iter([record_stop_request]))
+ print("[+] Voice command recording ended!")
+
+ status = "Uh oh! Status is unknown."
+ if record_result.status == voice_agent_pb2.REC_SUCCESS:
+ status = "Yay! Status is success."
+ elif record_result.status == voice_agent_pb2.VOICE_NOT_RECOGNIZED:
+ status = "Voice not recognized."
+ elif record_result.status == voice_agent_pb2.INTENT_NOT_RECOGNIZED:
+ status = "Intent not recognized."
+
+ # Process the response
+ print("Status:", status)
+ print("Command:", record_result.command)
+ print("Intent:", record_result.intent)
+ intent_slots = []
+ for slot in record_result.intent_slots:
+ print("Slot Name:", slot.name)
+ print("Slot Value:", slot.value)
+ i_slot = voice_agent_pb2.IntentSlot(name=slot.name, value=slot.value)
+ intent_slots.append(i_slot)
+
+ if record_result.status == voice_agent_pb2.REC_SUCCESS:
+ print("[+] Executing voice command...")
+ exec_voice_command_request = voice_agent_pb2.ExecuteInput(intent=record_result.intent, intent_slots=intent_slots)
+ response = stub.ExecuteCommand(exec_voice_command_request)
+ print("Response:", response)
+ wake_word_detected = False
+ time.sleep(1)
+
+
elif mode == 'manual':
stub = voice_agent_pb2_grpc.VoiceAgentServiceStub(channel)
+ stt_framework = voice_agent_pb2.VOSK if stt_framework == "vosk" else voice_agent_pb2.WHISPER
+ online_mode = voice_agent_pb2.ONLINE if online_mode == True else voice_agent_pb2.OFFLINE
print("[+] Recording voice command in manual mode...")
- record_start_request = voice_agent_pb2.RecognizeVoiceControl(action=voice_agent_pb2.START, nlu_model=nlu_engine, record_mode=voice_agent_pb2.MANUAL)
+ record_start_request = voice_agent_pb2.RecognizeVoiceControl(action=voice_agent_pb2.START, nlu_model=nlu_engine, record_mode=voice_agent_pb2.MANUAL, stt_framework=stt_framework,online_mode=online_mode,)
response = stub.RecognizeVoiceCommand(iter([record_start_request]))
stream_id = response.stream_id
time.sleep(recording_time) # pause here for the number of seconds passed by user or default 5 seconds
- record_stop_request = voice_agent_pb2.RecognizeVoiceControl(action=voice_agent_pb2.STOP, nlu_model=nlu_engine, record_mode=voice_agent_pb2.MANUAL, stream_id=stream_id)
+ record_stop_request = voice_agent_pb2.RecognizeVoiceControl(action=voice_agent_pb2.STOP, nlu_model=nlu_engine, record_mode=voice_agent_pb2.MANUAL, stream_id=stream_id,stt_framework=stt_framework,online_mode=online_mode,)
record_result = stub.RecognizeVoiceCommand(iter([record_stop_request]))
print("[+] Voice command recording ended!")
diff --git a/agl_service_voiceagent/config.ini b/agl_service_voiceagent/config.ini
index 1651da5..d6d695e 100644
--- a/agl_service_voiceagent/config.ini
+++ b/agl_service_voiceagent/config.ini
@@ -1,12 +1,15 @@
[General]
base_audio_dir = /usr/share/nlu/commands/
-stt_model_path = /usr/share/vosk/VOSK_STT_MODEL_NAME/
-wake_word_model_path = /usr/share/vosk/VOSK_WWD_MODEL_NAME/
+vosk_model_path = /usr/share/vosk/vosk-model-small-en-us-0.15/
+whisper_model_path = /usr/share/whisper/tiny.pt
+whisper_cpp_path = /usr/bin/whisper-cpp
+whisper_cpp_model_path = /usr/share/whisper-cpp/models/tiny.en.bin
+wake_word_model_path = /usr/share/vosk/vosk-model-small-en-us-0.15/
snips_model_path = /usr/share/nlu/snips/model/
channels = 1
sample_rate = 16000
bits_per_sample = 16
-wake_word = WAKE_WORD_VALUE
+wake_word = hello
server_port = 51053
server_address = 127.0.0.1
rasa_model_path = /usr/share/nlu/rasa/models/
@@ -14,13 +17,28 @@ rasa_server_port = 51054
rasa_detached_mode = 1
base_log_dir = /usr/share/nlu/logs/
store_voice_commands = 0
+online_mode = 1
+online_mode_address = 65.108.107.216
+online_mode_port = 50051
+online_mode_timeout = 15
+mpd_ip = 127.0.0.1
+mpd_port = 6600
[Kuksa]
ip = 127.0.0.1
port = 55555
protocol = grpc
insecure = 0
-token = PYTHON_DIR/kuksa_certificates/jwt/super-admin.json.token
+token = /usr/lib/python3.12/site-packages/kuksa_certificates/jwt/super-admin.json.token
+tls_server_name = Server
+
+[VSS]
+hostname = localhost
+port = 55555
+protocol = grpc
+insecure = 0
+token_filename = /etc/xdg/AGL/agl-vss-helper/agl-vss-helper.token
+ca_cert_filename = /etc/kuksa-val/CA.pem
tls_server_name = Server
[Mapper]
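
The new [General] keys above wire up online mode and MPD control. As a rough sketch of how a consumer could read them with Python's standard configparser (the config path below is a placeholder, not the packaged location):

# Sketch: reading the online-mode settings added to config.ini above.
# The file path is an assumption; adjust to the actual install location.
import configparser

config = configparser.ConfigParser()
config.read("/etc/agl-service-voiceagent/config.ini")  # hypothetical path

general = config["General"]
online_mode = general.getboolean("online_mode")         # "1" -> True
online_addr = general.get("online_mode_address")
online_port = general.getint("online_mode_port")
online_timeout = general.getint("online_mode_timeout")  # seconds
mpd_ip = general.get("mpd_ip")
mpd_port = general.getint("mpd_port")

if online_mode:
    print(f"Online STT at {online_addr}:{online_port} (timeout {online_timeout}s)")
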
diff --git a/agl_service_voiceagent/generated/audio_processing_pb2.py b/agl_service_voiceagent/generated/audio_processing_pb2.py
new file mode 100644
index 0000000..fdbeedb
--- /dev/null
+++ b/agl_service_voiceagent/generated/audio_processing_pb2.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: audio_processing.proto
+# Protobuf Python Version: 5.26.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x16\x61udio_processing.proto\x12\taudioproc\"\"\n\x0c\x41udioRequest\x12\x12\n\naudio_data\x18\x01 \x01(\x0c\"\x1c\n\x0cTextResponse\x12\x0c\n\x04text\x18\x01 \x01(\t2S\n\x0f\x41udioProcessing\x12@\n\x0cProcessAudio\x12\x17.audioproc.AudioRequest\x1a\x17.audioproc.TextResponseb\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'audio_processing_pb2', _globals)
+if not _descriptor._USE_C_DESCRIPTORS:
+ DESCRIPTOR._loaded_options = None
+ _globals['_AUDIOREQUEST']._serialized_start=37
+ _globals['_AUDIOREQUEST']._serialized_end=71
+ _globals['_TEXTRESPONSE']._serialized_start=73
+ _globals['_TEXTRESPONSE']._serialized_end=101
+ _globals['_AUDIOPROCESSING']._serialized_start=103
+ _globals['_AUDIOPROCESSING']._serialized_end=186
+# @@protoc_insertion_point(module_scope)
diff --git a/agl_service_voiceagent/generated/audio_processing_pb2_grpc.py b/agl_service_voiceagent/generated/audio_processing_pb2_grpc.py
new file mode 100644
index 0000000..4b54903
--- /dev/null
+++ b/agl_service_voiceagent/generated/audio_processing_pb2_grpc.py
@@ -0,0 +1,106 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+import warnings
+
+from . import audio_processing_pb2 as audio__processing__pb2
+
+GRPC_GENERATED_VERSION = '1.65.0rc1'
+GRPC_VERSION = grpc.__version__
+EXPECTED_ERROR_RELEASE = '1.65.0'
+SCHEDULED_RELEASE_DATE = 'June 25, 2024'
+_version_not_supported = False
+
+try:
+ from grpc._utilities import first_version_is_lower
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
+except ImportError:
+ _version_not_supported = True
+
+if _version_not_supported:
+ warnings.warn(
+ f'The grpc package installed is at version {GRPC_VERSION},'
+ + f' but the generated code in audio_processing_pb2_grpc.py depends on'
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
+ + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
+ + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
+ RuntimeWarning
+ )
+
+
+class AudioProcessingStub(object):
+ """The audio processing service definition.
+ """
+
+ def __init__(self, channel):
+ """Constructor.
+
+ Args:
+ channel: A grpc.Channel.
+ """
+ self.ProcessAudio = channel.unary_unary(
+ '/audioproc.AudioProcessing/ProcessAudio',
+ request_serializer=audio__processing__pb2.AudioRequest.SerializeToString,
+ response_deserializer=audio__processing__pb2.TextResponse.FromString,
+ _registered_method=True)
+
+
+class AudioProcessingServicer(object):
+ """The audio processing service definition.
+ """
+
+ def ProcessAudio(self, request, context):
+ """Sends audio data and receives processed text.
+ """
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+
+def add_AudioProcessingServicer_to_server(servicer, server):
+ rpc_method_handlers = {
+ 'ProcessAudio': grpc.unary_unary_rpc_method_handler(
+ servicer.ProcessAudio,
+ request_deserializer=audio__processing__pb2.AudioRequest.FromString,
+ response_serializer=audio__processing__pb2.TextResponse.SerializeToString,
+ ),
+ }
+ generic_handler = grpc.method_handlers_generic_handler(
+ 'audioproc.AudioProcessing', rpc_method_handlers)
+ server.add_generic_rpc_handlers((generic_handler,))
+ server.add_registered_method_handlers('audioproc.AudioProcessing', rpc_method_handlers)
+
+
+ # This class is part of an EXPERIMENTAL API.
+class AudioProcessing(object):
+ """The audio processing service definition.
+ """
+
+ @staticmethod
+ def ProcessAudio(request,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.unary_unary(
+ request,
+ target,
+ '/audioproc.AudioProcessing/ProcessAudio',
+ audio__processing__pb2.AudioRequest.SerializeToString,
+ audio__processing__pb2.TextResponse.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
diff --git a/agl_service_voiceagent/generated/voice_agent_pb2.py b/agl_service_voiceagent/generated/voice_agent_pb2.py
new file mode 100644
index 0000000..4606f60
--- /dev/null
+++ b/agl_service_voiceagent/generated/voice_agent_pb2.py
@@ -0,0 +1,62 @@
+# -*- coding: utf-8 -*-
+# Generated by the protocol buffer compiler. DO NOT EDIT!
+# source: voice_agent.proto
+# Protobuf Python Version: 5.26.1
+"""Generated protocol buffer code."""
+from google.protobuf import descriptor as _descriptor
+from google.protobuf import descriptor_pool as _descriptor_pool
+from google.protobuf import symbol_database as _symbol_database
+from google.protobuf.internal import builder as _builder
+# @@protoc_insertion_point(imports)
+
+_sym_db = _symbol_database.Default()
+
+
+
+
+DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x11voice_agent.proto\"\x07\n\x05\x45mpty\"C\n\rServiceStatus\x12\x0f\n\x07version\x18\x01 \x01(\t\x12\x0e\n\x06status\x18\x02 \x01(\x08\x12\x11\n\twake_word\x18\x03 \x01(\t\"^\n\nVoiceAudio\x12\x13\n\x0b\x61udio_chunk\x18\x01 \x01(\x0c\x12\x14\n\x0c\x61udio_format\x18\x02 \x01(\t\x12\x13\n\x0bsample_rate\x18\x03 \x01(\x05\x12\x10\n\x08language\x18\x04 \x01(\t\" \n\x0eWakeWordStatus\x12\x0e\n\x06status\x18\x01 \x01(\x08\"\x93\x01\n\x17S_RecognizeVoiceControl\x12!\n\x0c\x61udio_stream\x18\x01 \x01(\x0b\x32\x0b.VoiceAudio\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12\x11\n\tstream_id\x18\x03 \x01(\t\x12$\n\rstt_framework\x18\x04 \x01(\x0e\x32\r.STTFramework\"\xd1\x01\n\x15RecognizeVoiceControl\x12\x1d\n\x06\x61\x63tion\x18\x01 \x01(\x0e\x32\r.RecordAction\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\x12 \n\x0brecord_mode\x18\x03 \x01(\x0e\x32\x0b.RecordMode\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\rstt_framework\x18\x05 \x01(\x0e\x32\r.STTFramework\x12 \n\x0bonline_mode\x18\x06 \x01(\x0e\x32\x0b.OnlineMode\"J\n\x14RecognizeTextControl\x12\x14\n\x0ctext_command\x18\x01 \x01(\t\x12\x1c\n\tnlu_model\x18\x02 \x01(\x0e\x32\t.NLUModel\")\n\nIntentSlot\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t\"\x8e\x01\n\x0fRecognizeResult\x12\x0f\n\x07\x63ommand\x18\x01 \x01(\t\x12\x0e\n\x06intent\x18\x02 \x01(\t\x12!\n\x0cintent_slots\x18\x03 \x03(\x0b\x32\x0b.IntentSlot\x12\x11\n\tstream_id\x18\x04 \x01(\t\x12$\n\x06status\x18\x05 \x01(\x0e\x32\x14.RecognizeStatusType\"A\n\x0c\x45xecuteInput\x12\x0e\n\x06intent\x18\x01 \x01(\t\x12!\n\x0cintent_slots\x18\x02 \x03(\x0b\x32\x0b.IntentSlot\"E\n\rExecuteResult\x12\x10\n\x08response\x18\x01 \x01(\t\x12\"\n\x06status\x18\x02 \x01(\x0e\x32\x12.ExecuteStatusType*%\n\x0cSTTFramework\x12\x08\n\x04VOSK\x10\x00\x12\x0b\n\x07WHISPER\x10\x01*%\n\nOnlineMode\x12\n\n\x06ONLINE\x10\x00\x12\x0b\n\x07OFFLINE\x10\x01*#\n\x0cRecordAction\x12\t\n\x05START\x10\x00\x12\x08\n\x04STOP\x10\x01*\x1f\n\x08NLUModel\x12\t\n\x05SNIPS\x10\x00\x12\x08\n\x04RASA\x10\x01*\"\n\nRecordMode\x12\n\n\x06MANUAL\x10\x00\x12\x08\n\x04\x41UTO\x10\x01*\xb4\x01\n\x13RecognizeStatusType\x12\r\n\tREC_ERROR\x10\x00\x12\x0f\n\x0bREC_SUCCESS\x10\x01\x12\x12\n\x0eREC_PROCESSING\x10\x02\x12\x18\n\x14VOICE_NOT_RECOGNIZED\x10\x03\x12\x19\n\x15INTENT_NOT_RECOGNIZED\x10\x04\x12\x17\n\x13TEXT_NOT_RECOGNIZED\x10\x05\x12\x1b\n\x17NLU_MODEL_NOT_SUPPORTED\x10\x06*\x82\x01\n\x11\x45xecuteStatusType\x12\x0e\n\nEXEC_ERROR\x10\x00\x12\x10\n\x0c\x45XEC_SUCCESS\x10\x01\x12\x14\n\x10KUKSA_CONN_ERROR\x10\x02\x12\x18\n\x14INTENT_NOT_SUPPORTED\x10\x03\x12\x1b\n\x17INTENT_SLOTS_INCOMPLETE\x10\x04\x32\xa4\x03\n\x11VoiceAgentService\x12,\n\x12\x43heckServiceStatus\x12\x06.Empty\x1a\x0e.ServiceStatus\x12\x34\n\x10S_DetectWakeWord\x12\x0b.VoiceAudio\x1a\x0f.WakeWordStatus(\x01\x30\x01\x12+\n\x0e\x44\x65tectWakeWord\x12\x06.Empty\x1a\x0f.WakeWordStatus0\x01\x12G\n\x17S_RecognizeVoiceCommand\x12\x18.S_RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12\x43\n\x15RecognizeVoiceCommand\x12\x16.RecognizeVoiceControl\x1a\x10.RecognizeResult(\x01\x12?\n\x14RecognizeTextCommand\x12\x15.RecognizeTextControl\x1a\x10.RecognizeResult\x12/\n\x0e\x45xecuteCommand\x12\r.ExecuteInput\x1a\x0e.ExecuteResultb\x06proto3')
+
+_globals = globals()
+_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
+_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'voice_agent_pb2', _globals)
+if not _descriptor._USE_C_DESCRIPTORS:
+ DESCRIPTOR._loaded_options = None
+ _globals['_STTFRAMEWORK']._serialized_start=993
+ _globals['_STTFRAMEWORK']._serialized_end=1030
+ _globals['_ONLINEMODE']._serialized_start=1032
+ _globals['_ONLINEMODE']._serialized_end=1069
+ _globals['_RECORDACTION']._serialized_start=1071
+ _globals['_RECORDACTION']._serialized_end=1106
+ _globals['_NLUMODEL']._serialized_start=1108
+ _globals['_NLUMODEL']._serialized_end=1139
+ _globals['_RECORDMODE']._serialized_start=1141
+ _globals['_RECORDMODE']._serialized_end=1175
+ _globals['_RECOGNIZESTATUSTYPE']._serialized_start=1178
+ _globals['_RECOGNIZESTATUSTYPE']._serialized_end=1358
+ _globals['_EXECUTESTATUSTYPE']._serialized_start=1361
+ _globals['_EXECUTESTATUSTYPE']._serialized_end=1491
+ _globals['_EMPTY']._serialized_start=21
+ _globals['_EMPTY']._serialized_end=28
+ _globals['_SERVICESTATUS']._serialized_start=30
+ _globals['_SERVICESTATUS']._serialized_end=97
+ _globals['_VOICEAUDIO']._serialized_start=99
+ _globals['_VOICEAUDIO']._serialized_end=193
+ _globals['_WAKEWORDSTATUS']._serialized_start=195
+ _globals['_WAKEWORDSTATUS']._serialized_end=227
+ _globals['_S_RECOGNIZEVOICECONTROL']._serialized_start=230
+ _globals['_S_RECOGNIZEVOICECONTROL']._serialized_end=377
+ _globals['_RECOGNIZEVOICECONTROL']._serialized_start=380
+ _globals['_RECOGNIZEVOICECONTROL']._serialized_end=589
+ _globals['_RECOGNIZETEXTCONTROL']._serialized_start=591
+ _globals['_RECOGNIZETEXTCONTROL']._serialized_end=665
+ _globals['_INTENTSLOT']._serialized_start=667
+ _globals['_INTENTSLOT']._serialized_end=708
+ _globals['_RECOGNIZERESULT']._serialized_start=711
+ _globals['_RECOGNIZERESULT']._serialized_end=853
+ _globals['_EXECUTEINPUT']._serialized_start=855
+ _globals['_EXECUTEINPUT']._serialized_end=920
+ _globals['_EXECUTERESULT']._serialized_start=922
+ _globals['_EXECUTERESULT']._serialized_end=991
+ _globals['_VOICEAGENTSERVICE']._serialized_start=1494
+ _globals['_VOICEAGENTSERVICE']._serialized_end=1914
+# @@protoc_insertion_point(module_scope)
diff --git a/agl_service_voiceagent/generated/voice_agent_pb2_grpc.py b/agl_service_voiceagent/generated/voice_agent_pb2_grpc.py
new file mode 100644
index 0000000..15d76f4
--- /dev/null
+++ b/agl_service_voiceagent/generated/voice_agent_pb2_grpc.py
@@ -0,0 +1,362 @@
+# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
+"""Client and server classes corresponding to protobuf-defined services."""
+import grpc
+import warnings
+
+from . import voice_agent_pb2 as voice__agent__pb2
+
+GRPC_GENERATED_VERSION = '1.65.0rc1'
+GRPC_VERSION = grpc.__version__
+EXPECTED_ERROR_RELEASE = '1.65.0'
+SCHEDULED_RELEASE_DATE = 'June 25, 2024'
+_version_not_supported = False
+
+try:
+ from grpc._utilities import first_version_is_lower
+ _version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
+except ImportError:
+ _version_not_supported = True
+
+if _version_not_supported:
+ warnings.warn(
+ f'The grpc package installed is at version {GRPC_VERSION},'
+ + f' but the generated code in voice_agent_pb2_grpc.py depends on'
+ + f' grpcio>={GRPC_GENERATED_VERSION}.'
+ + f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+ + f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
+ + f' This warning will become an error in {EXPECTED_ERROR_RELEASE},'
+ + f' scheduled for release on {SCHEDULED_RELEASE_DATE}.',
+ RuntimeWarning
+ )
+
+
+class VoiceAgentServiceStub(object):
+ """Missing associated documentation comment in .proto file."""
+
+ def __init__(self, channel):
+ """Constructor.
+
+ Args:
+ channel: A grpc.Channel.
+ """
+ self.CheckServiceStatus = channel.unary_unary(
+ '/VoiceAgentService/CheckServiceStatus',
+ request_serializer=voice__agent__pb2.Empty.SerializeToString,
+ response_deserializer=voice__agent__pb2.ServiceStatus.FromString,
+ _registered_method=True)
+ self.S_DetectWakeWord = channel.stream_stream(
+ '/VoiceAgentService/S_DetectWakeWord',
+ request_serializer=voice__agent__pb2.VoiceAudio.SerializeToString,
+ response_deserializer=voice__agent__pb2.WakeWordStatus.FromString,
+ _registered_method=True)
+ self.DetectWakeWord = channel.unary_stream(
+ '/VoiceAgentService/DetectWakeWord',
+ request_serializer=voice__agent__pb2.Empty.SerializeToString,
+ response_deserializer=voice__agent__pb2.WakeWordStatus.FromString,
+ _registered_method=True)
+ self.S_RecognizeVoiceCommand = channel.stream_unary(
+ '/VoiceAgentService/S_RecognizeVoiceCommand',
+ request_serializer=voice__agent__pb2.S_RecognizeVoiceControl.SerializeToString,
+ response_deserializer=voice__agent__pb2.RecognizeResult.FromString,
+ _registered_method=True)
+ self.RecognizeVoiceCommand = channel.stream_unary(
+ '/VoiceAgentService/RecognizeVoiceCommand',
+ request_serializer=voice__agent__pb2.RecognizeVoiceControl.SerializeToString,
+ response_deserializer=voice__agent__pb2.RecognizeResult.FromString,
+ _registered_method=True)
+ self.RecognizeTextCommand = channel.unary_unary(
+ '/VoiceAgentService/RecognizeTextCommand',
+ request_serializer=voice__agent__pb2.RecognizeTextControl.SerializeToString,
+ response_deserializer=voice__agent__pb2.RecognizeResult.FromString,
+ _registered_method=True)
+ self.ExecuteCommand = channel.unary_unary(
+ '/VoiceAgentService/ExecuteCommand',
+ request_serializer=voice__agent__pb2.ExecuteInput.SerializeToString,
+ response_deserializer=voice__agent__pb2.ExecuteResult.FromString,
+ _registered_method=True)
+
+
+class VoiceAgentServiceServicer(object):
+ """Missing associated documentation comment in .proto file."""
+
+ def CheckServiceStatus(self, request, context):
+ """Missing associated documentation comment in .proto file."""
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+ def S_DetectWakeWord(self, request_iterator, context):
+ """Stream version of DetectWakeWord, assumes audio is coming from client
+ """
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+ def DetectWakeWord(self, request, context):
+ """Missing associated documentation comment in .proto file."""
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+ def S_RecognizeVoiceCommand(self, request_iterator, context):
+ """Stream version of RecognizeVoiceCommand, assumes audio is coming from client
+ """
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+ def RecognizeVoiceCommand(self, request_iterator, context):
+ """Missing associated documentation comment in .proto file."""
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+ def RecognizeTextCommand(self, request, context):
+ """Missing associated documentation comment in .proto file."""
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+ def ExecuteCommand(self, request, context):
+ """Missing associated documentation comment in .proto file."""
+ context.set_code(grpc.StatusCode.UNIMPLEMENTED)
+ context.set_details('Method not implemented!')
+ raise NotImplementedError('Method not implemented!')
+
+
+def add_VoiceAgentServiceServicer_to_server(servicer, server):
+ rpc_method_handlers = {
+ 'CheckServiceStatus': grpc.unary_unary_rpc_method_handler(
+ servicer.CheckServiceStatus,
+ request_deserializer=voice__agent__pb2.Empty.FromString,
+ response_serializer=voice__agent__pb2.ServiceStatus.SerializeToString,
+ ),
+ 'S_DetectWakeWord': grpc.stream_stream_rpc_method_handler(
+ servicer.S_DetectWakeWord,
+ request_deserializer=voice__agent__pb2.VoiceAudio.FromString,
+ response_serializer=voice__agent__pb2.WakeWordStatus.SerializeToString,
+ ),
+ 'DetectWakeWord': grpc.unary_stream_rpc_method_handler(
+ servicer.DetectWakeWord,
+ request_deserializer=voice__agent__pb2.Empty.FromString,
+ response_serializer=voice__agent__pb2.WakeWordStatus.SerializeToString,
+ ),
+ 'S_RecognizeVoiceCommand': grpc.stream_unary_rpc_method_handler(
+ servicer.S_RecognizeVoiceCommand,
+ request_deserializer=voice__agent__pb2.S_RecognizeVoiceControl.FromString,
+ response_serializer=voice__agent__pb2.RecognizeResult.SerializeToString,
+ ),
+ 'RecognizeVoiceCommand': grpc.stream_unary_rpc_method_handler(
+ servicer.RecognizeVoiceCommand,
+ request_deserializer=voice__agent__pb2.RecognizeVoiceControl.FromString,
+ response_serializer=voice__agent__pb2.RecognizeResult.SerializeToString,
+ ),
+ 'RecognizeTextCommand': grpc.unary_unary_rpc_method_handler(
+ servicer.RecognizeTextCommand,
+ request_deserializer=voice__agent__pb2.RecognizeTextControl.FromString,
+ response_serializer=voice__agent__pb2.RecognizeResult.SerializeToString,
+ ),
+ 'ExecuteCommand': grpc.unary_unary_rpc_method_handler(
+ servicer.ExecuteCommand,
+ request_deserializer=voice__agent__pb2.ExecuteInput.FromString,
+ response_serializer=voice__agent__pb2.ExecuteResult.SerializeToString,
+ ),
+ }
+ generic_handler = grpc.method_handlers_generic_handler(
+ 'VoiceAgentService', rpc_method_handlers)
+ server.add_generic_rpc_handlers((generic_handler,))
+ server.add_registered_method_handlers('VoiceAgentService', rpc_method_handlers)
+
+
+ # This class is part of an EXPERIMENTAL API.
+class VoiceAgentService(object):
+ """Missing associated documentation comment in .proto file."""
+
+ @staticmethod
+ def CheckServiceStatus(request,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.unary_unary(
+ request,
+ target,
+ '/VoiceAgentService/CheckServiceStatus',
+ voice__agent__pb2.Empty.SerializeToString,
+ voice__agent__pb2.ServiceStatus.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
+
+ @staticmethod
+ def S_DetectWakeWord(request_iterator,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.stream_stream(
+ request_iterator,
+ target,
+ '/VoiceAgentService/S_DetectWakeWord',
+ voice__agent__pb2.VoiceAudio.SerializeToString,
+ voice__agent__pb2.WakeWordStatus.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
+
+ @staticmethod
+ def DetectWakeWord(request,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.unary_stream(
+ request,
+ target,
+ '/VoiceAgentService/DetectWakeWord',
+ voice__agent__pb2.Empty.SerializeToString,
+ voice__agent__pb2.WakeWordStatus.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
+
+ @staticmethod
+ def S_RecognizeVoiceCommand(request_iterator,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.stream_unary(
+ request_iterator,
+ target,
+ '/VoiceAgentService/S_RecognizeVoiceCommand',
+ voice__agent__pb2.S_RecognizeVoiceControl.SerializeToString,
+ voice__agent__pb2.RecognizeResult.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
+
+ @staticmethod
+ def RecognizeVoiceCommand(request_iterator,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.stream_unary(
+ request_iterator,
+ target,
+ '/VoiceAgentService/RecognizeVoiceCommand',
+ voice__agent__pb2.RecognizeVoiceControl.SerializeToString,
+ voice__agent__pb2.RecognizeResult.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
+
+ @staticmethod
+ def RecognizeTextCommand(request,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.unary_unary(
+ request,
+ target,
+ '/VoiceAgentService/RecognizeTextCommand',
+ voice__agent__pb2.RecognizeTextControl.SerializeToString,
+ voice__agent__pb2.RecognizeResult.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
+
+ @staticmethod
+ def ExecuteCommand(request,
+ target,
+ options=(),
+ channel_credentials=None,
+ call_credentials=None,
+ insecure=False,
+ compression=None,
+ wait_for_ready=None,
+ timeout=None,
+ metadata=None):
+ return grpc.experimental.unary_unary(
+ request,
+ target,
+ '/VoiceAgentService/ExecuteCommand',
+ voice__agent__pb2.ExecuteInput.SerializeToString,
+ voice__agent__pb2.ExecuteResult.FromString,
+ options,
+ channel_credentials,
+ insecure,
+ call_credentials,
+ compression,
+ wait_for_ready,
+ timeout,
+ metadata,
+ _registered_method=True)
diff --git a/agl_service_voiceagent/nlu/snips_interface.py b/agl_service_voiceagent/nlu/snips_interface.py
index 1febe92..a32f574 100644
--- a/agl_service_voiceagent/nlu/snips_interface.py
+++ b/agl_service_voiceagent/nlu/snips_interface.py
@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
import re
from typing import Text
from snips_inference_agl import SnipsNLUEngine
@@ -47,6 +46,10 @@ class SnipsInterface:
preprocessed_text = text.lower().strip()
+ # replace % with " percent" (must run before punctuation stripping removes '%')
+ preprocessed_text = re.sub(r'%', ' percent', preprocessed_text)
+ # replace ° with " degrees " (must run before punctuation stripping removes '°')
+ preprocessed_text = re.sub(r'°', ' degrees ', preprocessed_text)
# remove special characters, punctuation, and extra whitespaces
preprocessed_text = re.sub(r'[^\w\s]', '', preprocessed_text).strip()
return preprocessed_text
def extract_intent(self, text: Text):
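
In isolation, the new normalization behaves as below (a minimal sketch; the sample utterances are illustrative):

# Sketch: the unit normalization added to preprocess_text above, standalone.
# '%' and '°' are expanded to words before punctuation stripping deletes them.
import re

def preprocess(text: str) -> str:
    text = text.lower().strip()
    text = re.sub(r'%', ' percent', text)
    text = re.sub(r'°', ' degrees ', text)
    return re.sub(r'[^\w\s]', '', text).strip()

print(preprocess("Set the fan to 50%"))       # -> "set the fan to 50 percent"
print(preprocess("Set temperature to 22°C"))  # -> "set temperature to 22 degrees c"
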
diff --git a/agl_service_voiceagent/protos/audio_processing.proto b/agl_service_voiceagent/protos/audio_processing.proto
new file mode 100644
index 0000000..edacc04
--- /dev/null
+++ b/agl_service_voiceagent/protos/audio_processing.proto
@@ -0,0 +1,23 @@
+// proto file for audio processing service for whisper online service
+
+syntax = "proto3";
+
+package audioproc;
+
+service AudioProcessing {
+ // Sends audio data and receives processed text.
+ rpc ProcessAudio (AudioRequest) returns (TextResponse);
+}
+
+// The request message containing the audio data.
+message AudioRequest {
+ bytes audio_data = 1;
+}
+
+// The response message containing the processed text.
+message TextResponse {
+ string text = 1;
+}
+
+// usage:
+// python -m grpc_tools.protoc -I. --python_out=./generated/ --grpc_python_out=./generated/ audio_processing.proto \ No newline at end of file
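
This is the service the voice agent's online mode talks to over gRPC. A minimal client sketch against the generated modules added in this commit; the server address, wav file name, and 15-second timeout are assumptions mirroring the config.ini defaults:

# Sketch: calling the whisper online ASR service defined above.
import grpc
from agl_service_voiceagent.generated import audio_processing_pb2
from agl_service_voiceagent.generated import audio_processing_pb2_grpc

def transcribe(address: str, wav_path: str) -> str:
    # Read the raw audio bytes and ship them in a single unary request.
    with open(wav_path, "rb") as f:
        audio = f.read()
    with grpc.insecure_channel(address) as channel:
        stub = audio_processing_pb2_grpc.AudioProcessingStub(channel)
        response = stub.ProcessAudio(
            audio_processing_pb2.AudioRequest(audio_data=audio),
            timeout=15,  # mirrors online_mode_timeout in config.ini
        )
    return response.text

# print(transcribe("127.0.0.1:50051", "command.wav"))
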
diff --git a/agl_service_voiceagent/protos/voice_agent.proto b/agl_service_voiceagent/protos/voice_agent.proto
index 40dfe6a..bd2daa2 100644
--- a/agl_service_voiceagent/protos/voice_agent.proto
+++ b/agl_service_voiceagent/protos/voice_agent.proto
@@ -11,6 +11,15 @@ service VoiceAgentService {
rpc ExecuteCommand(ExecuteInput) returns (ExecuteResult);
}
+enum STTFramework {
+ VOSK = 0;
+ WHISPER = 1;
+}
+
+enum OnlineMode {
+ ONLINE = 0;
+ OFFLINE = 1;
+}
enum RecordAction {
START = 0;
@@ -69,6 +78,7 @@ message S_RecognizeVoiceControl {
VoiceAudio audio_stream = 1;
NLUModel nlu_model = 2;
string stream_id = 3;
+ STTFramework stt_framework = 4;
}
message RecognizeVoiceControl {
@@ -76,6 +86,8 @@ message RecognizeVoiceControl {
NLUModel nlu_model = 2;
RecordMode record_mode = 3;
string stream_id = 4;
+ STTFramework stt_framework = 5;
+ OnlineMode online_mode = 6;
}
message RecognizeTextControl {
diff --git a/agl_service_voiceagent/server.py b/agl_service_voiceagent/server.py
index aa107dc..b244aa4 100644
--- a/agl_service_voiceagent/server.py
+++ b/agl_service_voiceagent/server.py
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import sys
+sys.path.append("../")
import grpc
from concurrent import futures
from agl_service_voiceagent.generated import voice_agent_pb2_grpc
@@ -24,7 +26,8 @@ def run_server():
logger = get_logger()
SERVER_URL = get_config_value('SERVER_ADDRESS') + ":" + str(get_config_value('SERVER_PORT'))
print("Starting Voice Agent Service...")
- print(f"STT Model Path: {get_config_value('STT_MODEL_PATH')}")
+ print(f"VOSK Model Path: {get_config_value('VOSK_MODEL_PATH')}")
+ print(f"WHISPER Model Path: {get_config_value('WHISPER_MODEL_PATH')}")
print(f"Audio Store Directory: {get_config_value('BASE_AUDIO_DIR')}")
server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
voice_agent_pb2_grpc.add_VoiceAgentServiceServicer_to_server(VoiceAgentServicer(), server)
diff --git a/agl_service_voiceagent/service.py b/agl_service_voiceagent/service.py
index baf7b02..b5fb50e 100644
--- a/agl_service_voiceagent/service.py
+++ b/agl_service_voiceagent/service.py
@@ -23,6 +23,7 @@ current_dir = os.path.dirname(os.path.abspath(__file__))
generated_dir = os.path.join(current_dir, "generated")
# Add the "generated" folder to sys.path
sys.path.append(generated_dir)
+sys.path.append("../")
import argparse
from agl_service_voiceagent.utils.config import set_config_path, load_config, update_config_value, get_config_value, get_logger
@@ -49,8 +50,9 @@ def main():
# Add the arguments for the server
server_parser.add_argument('--default', action='store_true', help='Starts the server based on default config file.')
server_parser.add_argument('--config', required=False, help='Path to a config file. Server is started based on this config file.')
- server_parser.add_argument('--stt-model-path', required=False, help='Path to the Speech To Text model for Voice Commad detection. Currently only supports VOSK Kaldi.')
- server_parser.add_argument('--ww-model-path', required=False, help='Path to the Speech To Text model for Wake Word detection. Currently only supports VOSK Kaldi. Defaults to the same model as --stt-model-path if not provided.')
+ server_parser.add_argument('--vosk-model-path', required=False, help='Path to the Vosk Speech To Text model for Voice Command detection.')
+ server_parser.add_argument('--whisper-model-path', required=False, help='Path to the Whisper Speech To Text model for Voice Command detection.')
+ server_parser.add_argument('--ww-model-path', required=False, help='Path to the Speech To Text model for Wake Word detection. Currently only supports VOSK Kaldi. Defaults to the same model as --vosk-model-path if not provided.')
server_parser.add_argument('--snips-model-path', required=False, help='Path to the Snips NLU model.')
server_parser.add_argument('--rasa-model-path', required=False, help='Path to the RASA NLU model.')
server_parser.add_argument('--rasa-detached-mode', required=False, help='Assume that the RASA server is already running and does not start it as a sub process.')
@@ -59,6 +61,13 @@ def main():
server_parser.add_argument('--audio-store-dir', required=False, help='Directory to store the generated audio files.')
server_parser.add_argument('--log-store-dir', required=False, help='Directory to store the generated log files.')
+ # Arguments for online mode
+ server_parser.add_argument('--online-mode', required=False, help='Enable online mode for the Voice Agent Service (default is False).')
+ server_parser.add_argument('--online-mode-address', required=False, help='URL of the online server to connect to.')
+ server_parser.add_argument('--online-mode-port', required=False, help='Port of the online server to connect to.')
+ server_parser.add_argument('--online-mode-timeout', required=False, help='Timeout value in seconds for the online server connection.')
+
+
# Add the arguments for the client
client_parser.add_argument('--server-address', required=True, help='Address of the gRPC server running the Voice Agent Service.')
client_parser.add_argument('--server-port', required=True, help='Port of the gRPC server running the Voice Agent Service.')
@@ -66,6 +75,10 @@ def main():
client_parser.add_argument('--mode', help='Mode to run the client in. Supported modes: "auto" and "manual".')
client_parser.add_argument('--nlu', help='NLU engine/model to use. Supported NLU engines: "snips" and "rasa".')
client_parser.add_argument('--recording-time', help='Number of seconds to continue recording the voice command. Required by the \'manual\' mode. Defaults to 10 seconds.')
+ client_parser.add_argument('--stt-framework', help='STT framework to use. Supported frameworks: "vosk" and "whisper". Defaults to "vosk".')
+
+ # Arguments for online mode in client as --online-mode is a reserved keyword
+ client_parser.add_argument('--online-mode', required=False, help='Enable online mode for the Voice Agent Service (default is False).')
args = parser.parse_args()
@@ -74,8 +87,12 @@ def main():
elif args.subcommand == 'run-server':
if not args.default and not args.config:
- if not args.stt_model_path:
- print("Error: The --stt-model-path is missing. Please provide a value. Use --help to see available options.")
+ if not args.vosk_model_path:
+ print("Error: The --vosk-model-path is missing. Please provide a value. Use --help to see available options.")
+ exit(1)
+
+ if not args.whisper_model_path:
+ print("Error: The --whisper-model-path is missing. Please provide a value. Use --help to see available options.")
exit(1)
if not args.snips_model_path:
@@ -94,6 +111,16 @@ def main():
print("Error: The --vss-signals-spec-path is missing. Please provide a value. Use --help to see available options.")
exit(1)
+ # Error check for online mode
+ if args.online_mode:
+ if not args.online_mode_address:
+ print("Error: The --online-mode-address is missing. Please provide a value. Use --help to see available options.")
+ exit(1)
+
+ if not args.online_mode_port:
+ print("Error: The --online-mode-port is missing. Please provide a value. Use --help to see available options.")
+ exit(1)
+
# Construct the default config file path
config_path = os.path.join(current_dir, "config.ini")
@@ -105,21 +132,36 @@ def main():
logger.info("Starting Voice Agent Service in server mode using CLI provided params...")
# Get the values provided by the user
- stt_path = args.stt_model_path
+ vosk_path = args.vosk_model_path
+ whisper_path = args.whisper_model_path
snips_model_path = args.snips_model_path
rasa_model_path = args.rasa_model_path
intents_vss_map_path = args.intents_vss_map_path
vss_signals_spec_path = args.vss_signals_spec_path
+ # Get the values for online mode
+ online_mode = False
+ if args.online_mode:
+ online_mode = True
+ online_mode_address = args.online_mode_address
+ online_mode_port = args.online_mode_port
+ online_mode_timeout = args.online_mode_timeout or 5
+ update_config_value('1', 'ONLINE_MODE')
+ update_config_value(online_mode_address, 'ONLINE_MODE_ADDRESS')
+ update_config_value(online_mode_port, 'ONLINE_MODE_PORT')
+ update_config_value(online_mode_timeout, 'ONLINE_MODE_TIMEOUT')
+
# Convert to an absolute path if it's a relative path
- stt_path = add_trailing_slash(os.path.abspath(stt_path)) if not os.path.isabs(stt_path) else stt_path
+ vosk_path = add_trailing_slash(os.path.abspath(vosk_path)) if not os.path.isabs(vosk_path) else vosk_path
+ whisper_path = add_trailing_slash(os.path.abspath(whisper_path)) if not os.path.isabs(whisper_path) else whisper_path
snips_model_path = add_trailing_slash(os.path.abspath(snips_model_path)) if not os.path.isabs(snips_model_path) else snips_model_path
rasa_model_path = add_trailing_slash(os.path.abspath(rasa_model_path)) if not os.path.isabs(rasa_model_path) else rasa_model_path
intents_vss_map_path = os.path.abspath(intents_vss_map_path) if not os.path.isabs(intents_vss_map_path) else intents_vss_map_path
vss_signals_spec_path = os.path.abspath(vss_signals_spec_path) if not os.path.isabs(vss_signals_spec_path) else vss_signals_spec_path
# Also update the config.ini file
- update_config_value(stt_path, 'STT_MODEL_PATH')
+ update_config_value(vosk_path, 'VOSK_MODEL_PATH')
+ update_config_value(whisper_path, 'WHISPER_MODEL_PATH')
update_config_value(snips_model_path, 'SNIPS_MODEL_PATH')
update_config_value(rasa_model_path, 'RASA_MODEL_PATH')
update_config_value(intents_vss_map_path, 'INTENTS_VSS_MAP')
@@ -162,7 +204,6 @@ def main():
logger = get_logger()
logger.info(f"Starting Voice Agent Service in server mode using the default config file...")
-
# create the base audio dir if not exists
if not os.path.exists(get_config_value('BASE_AUDIO_DIR')):
os.makedirs(get_config_value('BASE_AUDIO_DIR'))
@@ -176,6 +217,8 @@ def main():
mode = ""
action = args.action
recording_time = 5 # seconds
+ stt_framework = args.stt_framework or "vosk"
+ online_mode = args.online_mode or False
if action not in ["GetStatus", "DetectWakeWord", "ExecuteVoiceCommand", "ExecuteTextCommand"]:
print("Error: Invalid value for --action. Supported actions: 'GetStatus', 'DetectWakeWord', 'ExecuteVoiceCommand' and 'ExecuteTextCommand'. Use --help to see available options.")
@@ -199,8 +242,19 @@ def main():
mode = args.mode
if mode == "manual" and args.recording_time:
recording_time = int(args.recording_time)
-
- run_client(server_address, server_port, action, mode, nlu_engine, recording_time)
+ if args.stt_framework and args.stt_framework not in ['vosk', 'whisper']:
+ print("Error: Invalid value for --stt-framework. Supported frameworks: 'vosk' and 'whisper'. Use --help to see available options.")
+ exit(1)
+ if args.stt_framework:
+ stt_framework = args.stt_framework
+
+ if args.online_mode and args.online_mode not in ['True', 'False', 'true', 'false', '1', '0']:
+ print("Error: Invalid value for --online-mode. Supported values: 'True' and 'False'. Use --help to see available options.")
+ exit(1)
+ if args.online_mode:
+ online_mode = True if args.online_mode in ['True', 'true', '1'] else False
+
+ run_client(server_address, server_port, action, mode, nlu_engine, recording_time, stt_framework, online_mode)
else:
print_version()
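
run_client now takes the two extra arguments threaded through above; calling it directly is roughly equivalent to run-client with --stt-framework and --online-mode (a sketch; the argument values are illustrative):

# Sketch: direct invocation of the extended client entry point.
from agl_service_voiceagent.client import run_client

run_client(
    server_address="127.0.0.1",
    server_port="51053",       # passed as a string, as argparse delivers it
    action="ExecuteVoiceCommand",
    mode="manual",
    nlu_engine="snips",
    recording_time=5,          # seconds
    stt_framework="whisper",   # "vosk" or "whisper"
    online_mode=False,         # offline recognition on the target
)
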
diff --git a/agl_service_voiceagent/servicers/voice_agent_servicer.py b/agl_service_voiceagent/servicers/voice_agent_servicer.py
index 0565655..2a4de33 100644
--- a/agl_service_voiceagent/servicers/voice_agent_servicer.py
+++ b/agl_service_voiceagent/servicers/voice_agent_servicer.py
@@ -14,9 +14,12 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import sys
+sys.path.append("../")
import json
import time
import threading
+import asyncio
from agl_service_voiceagent.generated import voice_agent_pb2
from agl_service_voiceagent.generated import voice_agent_pb2_grpc
from agl_service_voiceagent.utils.audio_recorder import AudioRecorder
@@ -28,6 +31,10 @@ from agl_service_voiceagent.utils.config import get_config_value, get_logger
from agl_service_voiceagent.utils.common import generate_unique_uuid, delete_file
from agl_service_voiceagent.nlu.snips_interface import SnipsInterface
from agl_service_voiceagent.nlu.rasa_interface import RASAInterface
+from agl_service_voiceagent.utils.stt_online_service import STTOnlineService
+from agl_service_voiceagent.utils.vss_interface import VSSInterface
+from kuksa_client.grpc import Datapoint
+from agl_service_voiceagent.utils.media_controller import MediaController
class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
@@ -46,7 +53,7 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
self.channels = int(get_config_value('CHANNELS'))
self.sample_rate = int(get_config_value('SAMPLE_RATE'))
self.bits_per_sample = int(get_config_value('BITS_PER_SAMPLE'))
- self.stt_model_path = get_config_value('STT_MODEL_PATH')
+ self.vosk_model_path = get_config_value('VOSK_MODEL_PATH')
self.wake_word_model_path = get_config_value('WAKE_WORD_MODEL_PATH')
self.snips_model_path = get_config_value('SNIPS_MODEL_PATH')
self.rasa_model_path = get_config_value('RASA_MODEL_PATH')
@@ -56,10 +63,25 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
self.store_voice_command = bool(int(get_config_value('STORE_VOICE_COMMANDS')))
self.logger = get_logger()
+ # load the whisper model paths
+ self.whisper_model_path = get_config_value('WHISPER_MODEL_PATH')
+ self.whisper_cpp_path = get_config_value('WHISPER_CPP_PATH')
+ self.whisper_cpp_model_path = get_config_value('WHISPER_CPP_MODEL_PATH')
+
+ # loading values for online mode
+ self.online_mode = bool(int(get_config_value('ONLINE_MODE')))
+ if self.online_mode:
+ self.online_mode_address = get_config_value('ONLINE_MODE_ADDRESS')
+ self.online_mode_port = int(get_config_value('ONLINE_MODE_PORT'))
+ self.online_mode_timeout = int(get_config_value('ONLINE_MODE_TIMEOUT'))
+ self.stt_online = STTOnlineService(self.online_mode_address, self.online_mode_port, self.online_mode_timeout)
+ self.stt_online.initialize_connection()
+
+
# Initialize class methods
self.logger.info("Loading Speech to Text and Wake Word Model...")
- self.stt_model = STTModel(self.stt_model_path, self.sample_rate)
- self.stt_wake_word_model = STTModel(self.wake_word_model_path, self.sample_rate)
+ self.stt_model = STTModel(self.vosk_model_path, self.whisper_model_path,self.whisper_cpp_path,self.whisper_cpp_model_path,self.sample_rate)
+ self.stt_wake_word_model = STTModel(self.vosk_model_path, self.whisper_model_path,self.whisper_cpp_path,self.whisper_cpp_model_path,self.sample_rate)
self.logger.info("Speech to Text and Wake Word Model loaded successfully.")
self.logger.info("Starting SNIPS intent engine...")
@@ -78,14 +100,91 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
self.logger.info(f"RASA intent engine detached mode detected! Assuming RASA server is running at URL: 127.0.0.1:{self.rasa_server_port}")
self.rvc_stream_uuids = {}
- self.kuksa_client = KuksaInterface()
- self.kuksa_client.connect_kuksa_client()
- self.kuksa_client.authorize_kuksa_client()
self.logger.info(f"Loading and parsing mapping files...")
self.mapper = Intent2VSSMapper()
self.logger.info(f"Successfully loaded and parsed mapping files.")
+ # Media controller
+ self.media_controller = MediaController()
+
+ self.vss_interface = VSSInterface()
+ self.vss_thread = threading.Thread(target=self.start_vss_client)
+ self.vss_thread.start()
+ self.vss_event_loop = None
+
+ # VSS client methods
+
+ def start_vss_client(self):
+ """
+ Start the VSS client.
+ """
+ loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(loop)
+ loop.run_until_complete(self.vss_interface.connect_vss_client())
+ self.vss_event_loop = loop
+ loop.run_forever()
+
+ def connect_vss_client(self):
+ """
+ Connect the VSS client.
+ """
+ future = asyncio.run_coroutine_threadsafe(
+ self.vss_interface.connect_vss_client(),
+ self.vss_event_loop
+ )
+ return future.result()
+
+ def set_current_values(self, path=None, value=None):
+ """
+ Set the current values.
+
+ Args:
+ path (str): The path to set.
+ value (any): The value to set.
+ """
+ future = asyncio.run_coroutine_threadsafe(
+ self.vss_interface.set_current_values(path, value),
+ self.vss_event_loop
+ )
+ return future.result()
+
+ def get_current_values(self, paths=None):
+ """
+ Get the current values.
+
+ Args:
+ paths (list): The paths to get.
+
+ Returns:
+ dict: The current values.
+ """
+ print("Getting current values for paths:", paths)
+ future = asyncio.run_coroutine_threadsafe(
+ self.vss_interface.get_current_values(paths),
+ self.vss_event_loop
+ )
+ return future.result()
+
+ def disconnect_vss_client(self):
+ """
+ Disconnect the VSS client.
+ """
+ future = asyncio.run_coroutine_threadsafe(
+ self.vss_interface.disconnect_vss_client(),
+ self.vss_event_loop
+ )
+ return future.result()
+
+ def get_vss_server_info(self):
+ """
+ Get the VSS server information.
+ """
+ future = asyncio.run_coroutine_threadsafe(
+ self.vss_interface.get_server_info(),
+ self.vss_event_loop
+ )
+ return future.result()
def CheckServiceStatus(self, request, context):
"""
@@ -153,6 +252,16 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
log_intent_slots = []
for request in requests:
+ stt_framework = ''
+ if request.stt_framework == voice_agent_pb2.VOSK:
+ stt_framework = 'vosk'
+ elif request.stt_framework == voice_agent_pb2.WHISPER:
+ stt_framework = 'whisper'
+
+ use_online_mode = False
+ if request.online_mode == voice_agent_pb2.ONLINE:
+ use_online_mode = True
+
if request.record_mode == voice_agent_pb2.MANUAL:
if request.action == voice_agent_pb2.START:
@@ -187,14 +296,32 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
del self.rvc_stream_uuids[stream_uuid]
recorder.stop_recording()
- recognizer_uuid = self.stt_model.setup_recognizer()
- stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file)
+
+ used_kaldi = False
+
+ if use_online_mode and self.online_mode:
+ print("Recognizing voice command using online mode.")
+ if self.stt_online.initialized:
+ stt = self.stt_online.recognize_audio(audio_file=audio_file)
+ elif not self.stt_online.initialized:
+ self.stt_online.initialize_connection()
+ stt = self.stt_online.recognize_audio(audio_file=audio_file)
+ else:
+ recognizer_uuid = self.stt_model.setup_vosk_recognizer()
+ stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework)
+ used_kaldi = True
+
+ if use_online_mode and self.online_mode and stt is None:
+ print("Online mode enabled but failed to recognize voice command. Switching to offline mode.")
+ recognizer_uuid = self.stt_model.setup_vosk_recognizer()
+ stt = self.stt_model.recognize_from_file(recognizer_uuid, audio_file,stt_framework=stt_framework)
+ used_kaldi = True
+ print(stt)
if stt not in ["FILE_NOT_FOUND", "FILE_FORMAT_INVALID", "VOICE_NOT_RECOGNIZED", ""]:
if request.nlu_model == voice_agent_pb2.SNIPS:
extracted_intent = self.snips_interface.extract_intent(stt)
intent, intent_actions = self.snips_interface.process_intent(extracted_intent)
-
if not intent or intent == "":
status = voice_agent_pb2.INTENT_NOT_RECOGNIZED
@@ -223,7 +350,9 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
status = voice_agent_pb2.VOICE_NOT_RECOGNIZED
# cleanup the kaldi recognizer
- self.stt_model.cleanup_recognizer(recognizer_uuid)
+ if used_kaldi:
+ self.stt_model.cleanup_recognizer(recognizer_uuid)
+ used_kaldi = False
# delete the audio file
if not self.store_voice_command:
@@ -323,6 +452,9 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
"""
Execute the voice command by sending the intent to Kuksa.
"""
+ if not self.vss_interface.is_connected:
+ self.logger.error("Kuksa client found disconnected. Trying to close old instance and re-connecting...")
+ self.connect_vss_client()
# Log the unique request ID, client's IP address, and the endpoint
request_id = generate_unique_uuid(8)
client_ip = context.peer()
@@ -335,36 +467,111 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
slot_name = slot.name
slot_value = slot.value
processed_slots.append({"name": slot_name, "value": slot_value})
-
print(intent)
print(processed_slots)
+
+ # Check for the media control intents
+ if intent == "MediaControl":
+ for slot in processed_slots:
+ if slot["name"] == "media_control_action":
+ action = slot["value"]
+
+ if action == "resume" or action == "play":
+ if self.media_controller.resume():
+ exec_response = "Yay, I successfully resumed the media."
+ exec_status = voice_agent_pb2.EXEC_SUCCESS
+ else:
+ exec_response = "Uh oh, I failed to resume the media."
+ exec_status = voice_agent_pb2.EXEC_ERROR
+
+ elif action == "pause":
+ if self.media_controller.pause():
+ exec_response = "Yay, I successfully paused the media."
+ exec_status = voice_agent_pb2.EXEC_SUCCESS
+ else:
+ exec_response = "Uh oh, I failed to pause the media."
+ exec_status = voice_agent_pb2.EXEC_ERROR
+
+ elif action == "next":
+ if self.media_controller.next():
+ exec_response = "Yay, I successfully played the next track."
+ exec_status = voice_agent_pb2.EXEC_SUCCESS
+ else:
+ exec_response = "Uh oh, I failed to play the next track."
+ exec_status = voice_agent_pb2.EXEC_ERROR
+
+ elif action == "previous":
+ if self.media_controller.previous():
+ exec_response = "Yay, I successfully played the previous track."
+ exec_status = voice_agent_pb2.EXEC_SUCCESS
+ else:
+ exec_response = "Uh oh, I failed to play the previous track."
+ exec_status = voice_agent_pb2.EXEC_ERROR
+
+ elif action == "stop":
+ if self.media_controller.stop():
+ exec_response = "Yay, I successfully stopped the media."
+ exec_status = voice_agent_pb2.EXEC_SUCCESS
+ else:
+ exec_response = "Uh oh, I failed to stop the media."
+ exec_status = voice_agent_pb2.EXEC_ERROR
+ else:
+ exec_response = "Sorry, I failed to execute command against intent 'MediaControl'. Maybe try again with more specific instructions."
+ exec_status = voice_agent_pb2.EXEC_ERROR
+
+
+ response = voice_agent_pb2.ExecuteResult(
+ response=exec_response,
+ status=exec_status
+ )
+ return response
+
+
execution_list = self.mapper.parse_intent(intent, processed_slots, req_id=request_id)
exec_response = f"Sorry, I failed to execute command against intent '{intent}'. Maybe try again with more specific instructions."
exec_status = voice_agent_pb2.EXEC_ERROR
-
# Check for kuksa status, and try re-connecting again if status is False
- if not self.kuksa_client.get_kuksa_status():
+ if self.vss_interface.is_connected:
+ self.logger.info(f"[ReqID#{request_id}] Kuksa client found connected.")
+ else:
self.logger.error(f"[ReqID#{request_id}] Kuksa client found disconnected. Trying to close old instance and re-connecting...")
- self.kuksa_client.close_kuksa_client()
- self.kuksa_client.connect_kuksa_client()
- self.kuksa_client.authorize_kuksa_client()
-
+            self.disconnect_vss_client()
+            self.connect_vss_client()
+
+ if not self.vss_interface.is_connected:
+ self.logger.error(f"[ReqID#{request_id}] Kuksa client failed to connect.")
+ exec_response = "Uh oh, I failed to connect to Kuksa."
+ exec_status = voice_agent_pb2.KUKSA_CONN_ERROR
+ response = voice_agent_pb2.ExecuteResult(
+ response=exec_response,
+ status=exec_status
+ )
+ return response
for execution_item in execution_list:
print(execution_item)
action = execution_item["action"]
signal = execution_item["signal"]
- if self.kuksa_client.get_kuksa_status():
+ if self.vss_interface.is_connected:
if action == "set" and "value" in execution_item:
value = execution_item["value"]
- if self.kuksa_client.send_values(signal, value):
+ response = self.set_current_values(signal, value)
+                        if not response:
+ exec_response = "Uh oh, I failed to send value to Kuksa."
+ exec_status = voice_agent_pb2.KUKSA_CONN_ERROR
+ else:
exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
exec_status = voice_agent_pb2.EXEC_SUCCESS
-
+
elif action in ["increase", "decrease"]:
if "value" in execution_item:
value = execution_item["value"]
- if self.kuksa_client.send_values(signal, value):
+ if self.set_current_values(signal, value):
exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
exec_status = voice_agent_pb2.EXEC_SUCCESS
@@ -372,22 +579,26 @@ class VoiceAgentServicer(voice_agent_pb2_grpc.VoiceAgentServiceServicer):
# incoming values are always str as kuksa expects str during subscribe we need to convert
# the value to int before performing any arithmetic operations and then convert back to str
factor = int(execution_item["factor"])
- current_value = self.kuksa_client.get_value(signal)
- if current_value:
- current_value = int(current_value)
- if action == "increase":
- value = current_value + factor
- value = str(value)
- elif action == "decrease":
- value = current_value - factor
- value = str(value)
- if self.kuksa_client.send_values(signal, value):
- exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
- exec_status = voice_agent_pb2.EXEC_SUCCESS
-
- else:
+ current_value = self.get_current_values(signal)
+ if current_value is None:
exec_response = f"Uh oh, there is no value set for intent '{intent}'. Why not try setting a value first?"
exec_status = voice_agent_pb2.KUKSA_CONN_ERROR
+                    else:
+                        current_value = int(current_value)
+                        if action == "increase":
+                            value = str(current_value + factor)
+                        elif action == "decrease":
+                            value = str(current_value - factor)
+                        if self.set_current_values(signal, value):
+                            exec_response = f"Yay, I successfully updated the intent '{intent}' to value '{value}'."
+                            exec_status = voice_agent_pb2.EXEC_SUCCESS
else:
exec_response = "Uh oh, I failed to connect to Kuksa."
diff --git a/agl_service_voiceagent/utils/audio_recorder.py b/agl_service_voiceagent/utils/audio_recorder.py
index 2e8f11d..49716c9 100644
--- a/agl_service_voiceagent/utils/audio_recorder.py
+++ b/agl_service_voiceagent/utils/audio_recorder.py
@@ -66,6 +66,9 @@ class AudioRecorder:
self.pipeline = Gst.Pipeline()
autoaudiosrc = Gst.ElementFactory.make("autoaudiosrc", None)
queue = Gst.ElementFactory.make("queue", None)
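+        # Setting max-size-* to 0 removes each limit, making the queue effectively unbounded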
+ queue.set_property("max-size-buffers", 0)
+ queue.set_property("max-size-bytes", 0)
+ queue.set_property("max-size-time", 0)
audioconvert = Gst.ElementFactory.make("audioconvert", None)
wavenc = Gst.ElementFactory.make("wavenc", None)
diff --git a/agl_service_voiceagent/utils/media_controller.py b/agl_service_voiceagent/utils/media_controller.py
new file mode 100644
index 0000000..60c2717
--- /dev/null
+++ b/agl_service_voiceagent/utils/media_controller.py
@@ -0,0 +1,132 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# Copyright (c) 2023 Malik Talha
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from mpd import MPDClient
+from agl_service_voiceagent.utils.config import get_config_value
+
+
+class MediaController:
+ def __init__(self):
+ self.client = MPDClient()
+ self.ip = get_config_value('MPD_IP')
+ self.port = get_config_value('MPD_PORT')
+ self.is_connected = self.connect()
+
+ def connect(self):
+ try:
+ self.client.connect(self.ip, self.port)
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to connect to MPD server: {e}")
+ return False
+
+ def play(self, uri):
+ '''
+ Play the media file at the specified URI.
+
+ Args:
+ uri (str): The URI of the media file to play.
+
+ '''
+ if not self.is_connected:
+ print("[-] Error: MPD client is not connected.")
+ return False
+
+ try:
+ self.client.clear()
+ self.client.add(uri)
+ self.client.play()
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to play media: {e}")
+ return False
+
+ def stop(self):
+ '''
+ Stop the media player.
+ '''
+ if not self.is_connected:
+ print("[-] Error: MPD client is not connected.")
+ return False
+ try:
+ self.client.stop()
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to stop media: {e}")
+ return False
+
+ def pause(self):
+ '''
+ Pause the media player.
+ '''
+ if not self.is_connected:
+ print("[-] Error: MPD client is not connected.")
+ return False
+ try:
+            self.client.pause(1)  # explicit pause; avoids MPD's deprecated no-argument toggle form
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to pause media: {e}")
+ return False
+
+ def resume(self):
+ '''
+ Resume the media player.
+ '''
+ if not self.is_connected:
+ print("[-] Error: MPD client is not connected.")
+ return False
+ try:
+ self.client.play()
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to resume media: {e}")
+ return False
+
+ def next(self):
+ '''
+ Play the next track in the playlist.
+ '''
+ if not self.is_connected:
+ print("[-] Error: MPD client is not connected.")
+ return False
+ try:
+ self.client.next()
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to play next track: {e}")
+ return False
+
+ def previous(self):
+ '''
+ Play the previous track in the playlist.
+ '''
+ if not self.is_connected:
+ print("[-] Error: MPD client is not connected.")
+ return False
+ try:
+ self.client.previous()
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to play previous track: {e}")
+ return False
+
+ def close(self):
+ self.client.close()
+ self.client.disconnect()
+
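+# Usage sketch, not exercised by the service itself; the track path below is
+# illustrative, and the real MPD_IP/MPD_PORT come from config.ini:
+#
+#   mc = MediaController()
+#   if mc.is_connected:
+#       mc.play("music/track01.mp3")  # path relative to the MPD music directory
+#       mc.pause()
+#       mc.close()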
diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py
index d51ae31..7e8ad8b 100644
--- a/agl_service_voiceagent/utils/stt_model.py
+++ b/agl_service_voiceagent/utils/stt_model.py
@@ -20,12 +20,18 @@ import vosk
import wave
from agl_service_voiceagent.utils.common import generate_unique_uuid
+# OpenAI Whisper (PyTorch) backend
+import whisper
+# ThreadPoolExecutor drives the Whisper transcription timeout
+from concurrent.futures import ThreadPoolExecutor, TimeoutError as FuturesTimeoutError
+# subprocess runs the whisper.cpp executable
+import subprocess
+
class STTModel:
"""
STTModel is a class for speech-to-text (STT) recognition using the Vosk speech recognition library.
"""
- def __init__(self, model_path, sample_rate=16000):
+    def __init__(self, vosk_model_path, whisper_model_path, whisper_cpp_path, whisper_cpp_model_path, sample_rate=16000):
"""
Initialize the STTModel instance with the provided model and sample rate.
@@ -34,12 +40,15 @@ class STTModel:
sample_rate (int, optional): The audio sample rate in Hz (default is 16000).
"""
self.sample_rate = sample_rate
- self.model = vosk.Model(model_path)
+ self.vosk_model = vosk.Model(vosk_model_path)
self.recognizer = {}
self.chunk_size = 1024
+        # The PyTorch Whisper model is loaded lazily on first use; whisper.cpp is the default backend
+        self.whisper_model = None
+        self.whisper_model_path = whisper_model_path
+        self.whisper_cpp_path = whisper_cpp_path
+        self.whisper_cpp_model_path = whisper_cpp_model_path
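+        # Three recognition paths are available: Vosk (offline), whisper.cpp (offline), and the optional PyTorch Whisper model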
- def setup_recognizer(self):
+ def setup_vosk_recognizer(self):
"""
Set up a Vosk recognizer for a new session and return a unique identifier (UUID) for the session.
@@ -47,10 +56,9 @@ class STTModel:
str: A unique identifier (UUID) for the session.
"""
uuid = generate_unique_uuid(6)
- self.recognizer[uuid] = vosk.KaldiRecognizer(self.model, self.sample_rate)
+ self.recognizer[uuid] = vosk.KaldiRecognizer(self.vosk_model, self.sample_rate)
return uuid
-
def init_recognition(self, uuid, audio_data):
"""
Initialize the Vosk recognizer for a session with audio data.
@@ -64,8 +72,8 @@ class STTModel:
"""
return self.recognizer[uuid].AcceptWaveform(audio_data)
-
- def recognize(self, uuid, partial=False):
+ # Recognize speech using the Vosk recognizer
+ def recognize_using_vosk(self, uuid, partial=False):
"""
Recognize speech and return the result as a JSON object.
@@ -84,14 +92,53 @@ class STTModel:
self.recognizer[uuid].Reset()
return result
+ # Recognize speech using the whisper model
+    def recognize_using_whisper(self, filename, language=None, timeout=5, fp16=False):
+ """
+ Recognize speech and return the result as a JSON object.
+
+ Args:
+ filename (str): The path to the audio file.
+ timeout (int, optional): The timeout for recognition (default is 5 seconds).
+            fp16 (bool, optional): If True, use 16-bit floating-point precision (default is False, since CUDA is not available on the target).
+ language (str, optional): The language code for recognition (default is None).
+
+ Returns:
+ dict: A JSON object containing recognition results.
+ """
+        def transcribe_with_whisper():
+            # Load the PyTorch Whisper model on first use
+            if self.whisper_model is None:
+                self.whisper_model = whisper.load_model(self.whisper_model_path)
+            return self.whisper_model.transcribe(filename, language=language, fp16=fp16)
+
+ with ThreadPoolExecutor() as executor:
+ future = executor.submit(transcribe_with_whisper)
+ try:
+ return future.result(timeout=timeout)
+            except FuturesTimeoutError:
+ return {"error": "Transcription with Whisper exceeded the timeout."}
+
+    def recognize_using_whisper_cpp(self, filename):
+        command = self.whisper_cpp_path
+        arguments = ["-m", self.whisper_cpp_model_path, "-f", filename, "-l", "en", "-nt"]
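+        # whisper.cpp flags: -m model path, -f input wav, -l language, -nt suppress timestamps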
+
+ # Run the executable with the specified arguments
+ result = subprocess.run([command] + arguments, capture_output=True, text=True)
+
+ if result.returncode == 0:
+ result = result.stdout.replace('\n', ' ').strip()
+ return {"text": result}
+ else:
+ print("Error:\n", result.stderr)
+ return {"error": result.stderr}
+
- def recognize_from_file(self, uuid, filename):
+    def recognize_from_file(self, uuid, filename, stt_framework="vosk"):
"""
Recognize speech from an audio file and return the recognized text.
Args:
uuid (str): The unique identifier (UUID) for the session.
filename (str): The path to the audio file.
+            stt_framework (str): The STT framework to use, "vosk" or "whisper" (default is "vosk").
Returns:
str: The recognized text or error messages.
@@ -115,12 +162,28 @@ class STTModel:
audio_data += chunk
if audio_data:
- if self.init_recognition(uuid, audio_data):
- result = self.recognize(uuid)
- return result['text']
- else:
- result = self.recognize(uuid, partial=True)
- return result['partial']
+ # Perform speech recognition using the specified STT model
+ if stt_framework == "vosk":
+ if self.init_recognition(uuid, audio_data):
+ result = self.recognize_using_vosk(uuid)
+ return result['text']
+ else:
+ result = self.recognize_using_vosk(uuid, partial=True)
+ return result['partial']
+
+ elif stt_framework == "whisper":
+ result = self.recognize_using_whisper_cpp(filename)
+ if 'error' in result:
+ print(result['error'])
+                    # If whisper.cpp fails, fall back to Vosk
+ if self.init_recognition(uuid, audio_data):
+ result = self.recognize_using_vosk(uuid)
+ return result['text']
+ else:
+ result = self.recognize_using_vosk(uuid, partial=True)
+ return result['partial']
+ else:
+ return result.get('text', '')
else:
print("Voice not recognized. Please speak again...")
diff --git a/agl_service_voiceagent/utils/stt_online_service.py b/agl_service_voiceagent/utils/stt_online_service.py
new file mode 100644
index 0000000..7bbdc5d
--- /dev/null
+++ b/agl_service_voiceagent/utils/stt_online_service.py
@@ -0,0 +1,88 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# Copyright (c) 2023 Malik Talha
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import grpc
+import sys
+sys.path.append("../")
+from agl_service_voiceagent.generated import audio_processing_pb2
+from agl_service_voiceagent.generated import audio_processing_pb2_grpc
+
+class STTOnlineService:
+ """
+    STTOnlineService connects to an online gRPC-based Whisper ASR service.
+ """
+    def __init__(self, server_address, server_port, server_timeout=5):
+ """
+ Initialize the online speech-to-text service.
+
+ Args:
+            server_address (str): The address of the online speech-to-text service.
+ server_port (int): The port number of the online speech-to-text service.
+ server_timeout (int, optional): The timeout value in seconds (default is 5).
+ """
+ self.server_address = server_address
+ self.server_port = server_port
+ self.server_timeout = server_timeout
+ self.client = None
+ self.initialized = False
+
+
+ def initialize_connection(self):
+ """
+ Initialize the connection to the online speech-to-text service.
+ """
+ try:
+ channel = grpc.insecure_channel(f"{self.server_address}:{self.server_port}")
+ self.client = audio_processing_pb2_grpc.AudioProcessingStub(channel)
+ self.initialized = True
+            print(f"STTOnlineService initialized with server address {self.server_address}, port {self.server_port}, and timeout {self.server_timeout} seconds.")
+        except Exception as e:
+            print("Error initializing online speech-to-text service:", e)
+ self.initialized = False
+ return self.initialized
+
+ def close_connection(self):
+ """
+ Close the connection to the online speech-to-text service.
+ """
+ self.client = None
+ self.initialized = False
+ return not self.initialized
+
+ def recognize_audio(self, audio_file):
+ """
+ Recognize speech from audio data.
+
+        Args:
+            audio_file (str): Path to the audio file to process.
+
+ Returns:
+ str: The recognized text.
+ """
+ if not self.initialized:
+ print("STTOnlineService not initialized.")
+ return None
+ try:
+            with open(audio_file, 'rb') as f:
+                audio_data = f.read()
+ request = audio_processing_pb2.AudioRequest(audio_data=audio_data)
+ response = self.client.ProcessAudio(request,timeout=self.server_timeout)
+ return response.text
+ except Exception as e:
+            print("Error recognizing audio:", e)
+ return None
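+
+# Usage sketch; the host, port, and file path below are illustrative, the real
+# values come from config.ini:
+#
+#   stt = STTOnlineService("127.0.0.1", "50052", server_timeout=5)
+#   if stt.initialize_connection():
+#       text = stt.recognize_audio("/tmp/command.wav")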
+
+ \ No newline at end of file
diff --git a/agl_service_voiceagent/utils/vss_interface.py b/agl_service_voiceagent/utils/vss_interface.py
new file mode 100644
index 0000000..a77e52c
--- /dev/null
+++ b/agl_service_voiceagent/utils/vss_interface.py
@@ -0,0 +1,236 @@
+import sys
+import threading
+from pathlib import Path
+from kuksa_client.grpc.aio import VSSClient
+from kuksa_client.grpc import Datapoint
+from agl_service_voiceagent.utils.config import get_config_value, get_logger
+
+
+class VSSInterface:
+ """
+ VSS Interface
+
+ This class provides methods to initialize, authorize, connect, send values,
+ check the status, and close the Kuksa client.
+ """
+
+ _instance = None
+ _lock = threading.Lock()
+
+ def __new__(cls):
+ """
+ Get the unique instance of the class.
+
+        Returns:
+            VSSInterface: The singleton instance of the class.
+        """
+ with cls._lock:
+ if cls._instance is None:
+ cls._instance = super(VSSInterface, cls).__new__(cls)
+ cls._instance.init_client()
+ return cls._instance
+
+ def init_client(self):
+ """
+ Initialize the Kuksa client.
+ """
+ # Defaults
+ self.hostname = str(get_config_value("hostname", "VSS"))
+ self.port = str(get_config_value("port", "VSS"))
+ self.token_filename = str(get_config_value("token_filename", "VSS"))
+ self.tls_server_name = str(get_config_value("tls_server_name", "VSS"))
+ self.verbose = False
+ self.insecure = bool(int(get_config_value("insecure", "VSS")))
+ self.protocol = str(get_config_value("protocol", "VSS"))
+ self.ca_cert_filename = str(get_config_value("ca_cert_filename", "VSS"))
+ self.token = None
+ self.is_connected = False
+ self.logger = get_logger()
+
+ self.set_token()
+
+        # validate config
+        if not self.validate_config():
+            sys.exit(1)
+
+        # VSS client handle, created on connect
+        self.vss_client = None
+
+ def validate_config(self):
+ """
+ Validate the Kuksa client configuration.
+
+ Returns:
+ bool: True if the configuration is valid, False otherwise.
+ """
+ if self.hostname is None:
+ print("[-] Error: Kuksa IP address is not set.")
+ self.logger.error("Kuksa IP address is not set.")
+ return False
+
+ if self.port is None:
+ print("[-] Error: Kuksa port is not set.")
+ self.logger.error("Kuksa port is not set.")
+ return False
+
+ if self.token is None:
+ print("[-] Warning: Kuksa auth token is not set.")
+ self.logger.warning("Kuksa auth token is not set.")
+
+ if self.protocol != "ws" and self.protocol != "grpc":
+ print("[-] Error: Invalid Kuksa protocol. Only 'ws' and 'grpc' are supported.")
+ self.logger.error("Invalid Kuksa protocol. Only 'ws' and 'grpc' are supported.")
+ return False
+
+ return True
+
+ def set_token(self):
+ """
+ Set the Kuksa auth token.
+ """
+        if self.token_filename != "":
+            with open(self.token_filename, "r") as token_file:
+                self.token = token_file.read()
+        else:
+            self.token = ""
+
+ def get_vss_client(self):
+ """
+ Get the VSS client instance.
+
+        Returns:
+            VSSClient: The VSS client instance, or None if not connected.
+        """
+        return self.vss_client
+
+ async def authorize_vss_client(self):
+ """
+ Authorize the VSS client.
+ """
+ if self.vss_client is None:
+ print("[-] Error: Failed to authorize Kuksa client. Kuksa client is not initialized.")
+ self.logger.error("Failed to authorize Kuksa client. Kuksa client is not initialized.")
+ return False
+ try:
+ await self.vss_client.authorize(self.token)
+            print("Authorized Kuksa client.")
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to authorize Kuksa client: {e}")
+ self.logger.error(f"Failed to authorize Kuksa client: {e}")
+ return False
+
+ async def get_server_info(self):
+ """
+ Get the server information.
+
+ Returns:
+ dict: The server information.
+ """
+ if self.vss_client is None:
+ return None
+ try:
+ return await self.vss_client.get_server_info()
+ except Exception as e:
+ print(f"[-] Error: Failed to get server info: {e}")
+ self.logger.error(f"Failed to get server info: {e}")
+ return None
+
+ async def connect_vss_client(self):
+ """
+ Connect the VSS client.
+ """
+ print(f"Connecting to KUKSA.val databroker at {self.hostname}:{self.port}")
+ try:
+ self.vss_client = VSSClient(
+ self.hostname,
+ self.port,
+ root_certificates=Path(self.ca_cert_filename),
+ token=self.token,
+ tls_server_name=self.tls_server_name,
+ ensure_startup_connection=True)
+ await self.vss_client.connect()
+ print(f"[+] Connected to KUKSA.val databroker at {self.hostname}:{self.port}")
+ self.is_connected = True
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to connect to Kuksa val databroker: {e}")
+ self.logger.error(f"Failed to connect to Kuksa val databroker: {e}")
+ self.is_connected = False
+ return False
+
+
+ async def set_current_values(self, path=None, value=None):
+ """
+ Set the current values.
+
+        Args:
+            path (str): The VSS signal path to update.
+            value: The value to set at the path.
+        """
+ result = False
+ if self.vss_client is None:
+ print(f"[-] Error: Failed to send value '{value}' to Kuksa. Kuksa client is not initialized.")
+ self.logger.error(f"Failed to send value '{value}' to Kuksa. Kuksa client is not initialized.")
+ return result
+ try:
+ await self.vss_client.set_current_values({path: Datapoint(value)})
+ result = True
+ except Exception as e:
+ print(f"[-] Error: Failed to send value '{value}' to Kuksa: {e}")
+ self.logger.error(f"Failed to send value '{value}' to Kuksa: {e}")
+ return result
+
+
+ async def get_current_values(self, path=None):
+ """
+ Get the current values.
+        Args:
+            path (str): The VSS signal path to read.
+
+        Returns:
+            The current value of the signal, or None on failure.
+
+        Example:
+            speed = await vss_interface.get_current_values('Vehicle.Speed')
+        """
+
+        if self.vss_client is None or not self.is_connected:
+ return None
+ try:
+ result = await self.vss_client.get_current_values([path])
+ return result[path].value
+ except Exception as e:
+ print(f"[-] Error: Failed to get current values: {e}")
+ self.logger.error(f"Failed to get current values: {e}")
+ return None
+
+ async def disconnect_vss_client(self):
+ """
+ Disconnect the VSS client.
+ """
+ if self.vss_client is None:
+ print("[-] Error: Failed to disconnect Kuksa client. Kuksa client is not initialized.")
+ self.logger.error("Failed to disconnect Kuksa client. Kuksa client is not initialized.")
+ return False
+ try:
+ await self.vss_client.disconnect()
+ print("Disconnected from Kuksa val databroker.")
+ self.is_connected = False
+ return True
+ except Exception as e:
+ print(f"[-] Error: Failed to disconnect from Kuksa val databroker: {e}")
+ self.logger.error(f"Failed to disconnect from Kuksa val databroker: {e}")
+ return False
+
+
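+# Usage sketch (asyncio driver; the signal path is illustrative, borrowed from
+# mappings/vss_signals_spec.json):
+#
+#   import asyncio
+#
+#   async def demo():
+#       vss = VSSInterface()
+#       if await vss.connect_vss_client():
+#           await vss.set_current_values("Vehicle.Cabin.HVAC.Station.Row1.Driver.FanSpeed", 50)
+#           value = await vss.get_current_values("Vehicle.Cabin.HVAC.Station.Row1.Driver.FanSpeed")
+#           await vss.disconnect_vss_client()
+#
+#   asyncio.run(demo())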
diff --git a/agl_service_voiceagent/utils/wake_word.py b/agl_service_voiceagent/utils/wake_word.py
index 47e547e..b672269 100644
--- a/agl_service_voiceagent/utils/wake_word.py
+++ b/agl_service_voiceagent/utils/wake_word.py
@@ -46,7 +46,7 @@ class WakeWordDetector:
self.channels = channels
self.bits_per_sample = bits_per_sample
self.wake_word_model = stt_model # Speech to text model recognizer
- self.recognizer_uuid = stt_model.setup_recognizer()
+ self.recognizer_uuid = stt_model.setup_vosk_recognizer()
self.audio_buffer = bytearray()
self.segment_size = int(self.sample_rate * 1.0) # Adjust the segment size (e.g., 1 second)
@@ -140,7 +140,7 @@ class WakeWordDetector:
# Perform wake word detection on the audio_data
if self.wake_word_model.init_recognition(self.recognizer_uuid, audio_data):
- stt_result = self.wake_word_model.recognize(self.recognizer_uuid)
+ stt_result = self.wake_word_model.recognize_using_vosk(self.recognizer_uuid)
print("STT Result: ", stt_result)
if self.wake_word in stt_result["text"]:
self.wake_word_detected = True
diff --git a/mappings/intents_vss_map.json b/mappings/intents_vss_map.json
index a34f93e..9a1d0a4 100644
--- a/mappings/intents_vss_map.json
+++ b/mappings/intents_vss_map.json
@@ -7,18 +7,18 @@
},
"HVACFanSpeed": {
"signals": [
- "Vehicle.Cabin.HVAC.Station.Row1.Left.FanSpeed",
- "Vehicle.Cabin.HVAC.Station.Row1.Right.FanSpeed",
- "Vehicle.Cabin.HVAC.Station.Row2.Left.FanSpeed",
- "Vehicle.Cabin.HVAC.Station.Row2.Right.FanSpeed"
+ "Vehicle.Cabin.HVAC.Station.Row1.Driver.FanSpeed",
+ "Vehicle.Cabin.HVAC.Station.Row1.Passenger.FanSpeed",
+ "Vehicle.Cabin.HVAC.Station.Row2.Driver.FanSpeed",
+ "Vehicle.Cabin.HVAC.Station.Row2.Passenger.FanSpeed"
]
},
"HVACTemperature": {
"signals": [
- "Vehicle.Cabin.HVAC.Station.Row1.Left.Temperature",
- "Vehicle.Cabin.HVAC.Station.Row1.Right.Temperature",
- "Vehicle.Cabin.HVAC.Station.Row2.Left.Temperature",
- "Vehicle.Cabin.HVAC.Station.Row2.Right.Temperature"
+ "Vehicle.Cabin.HVAC.Station.Row1.Driver.Temperature",
+ "Vehicle.Cabin.HVAC.Station.Row1.Passenger.Temperature",
+ "Vehicle.Cabin.HVAC.Station.Row2.Driver.Temperature",
+ "Vehicle.Cabin.HVAC.Station.Row2.Passenger.Temperature"
]
}
}
diff --git a/mappings/vss_signals_spec.json b/mappings/vss_signals_spec.json
index 996e1c7..3064c52 100644
--- a/mappings/vss_signals_spec.json
+++ b/mappings/vss_signals_spec.json
@@ -34,7 +34,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row1.Left.FanSpeed": {
+ "Vehicle.Cabin.HVAC.Station.Row1.Driver.FanSpeed": {
"default_value": 25,
"default_change_factor": 5,
"actions": {
@@ -68,7 +68,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row1.Right.FanSpeed": {
+ "Vehicle.Cabin.HVAC.Station.Row1.Passenger.FanSpeed": {
"default_value": 25,
"default_change_factor": 5,
"actions": {
@@ -102,7 +102,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row2.Left.FanSpeed": {
+ "Vehicle.Cabin.HVAC.Station.Row2.Driver.FanSpeed": {
"default_value": 25,
"default_change_factor": 5,
"actions": {
@@ -136,7 +136,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row2.Right.FanSpeed": {
+ "Vehicle.Cabin.HVAC.Station.Row2.Passenger.FanSpeed": {
"default_value": 25,
"default_change_factor": 5,
"actions": {
@@ -170,7 +170,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row1.Left.Temperature": {
+ "Vehicle.Cabin.HVAC.Station.Row1.Driver.Temperature": {
"default_value": 25,
"default_change_factor": 2,
"actions": {
@@ -204,7 +204,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row1.Right.Temperature": {
+ "Vehicle.Cabin.HVAC.Station.Row1.Passenger.Temperature": {
"default_value": 25,
"default_change_factor": 2,
"actions": {
@@ -238,7 +238,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row2.Left.Temperature": {
+ "Vehicle.Cabin.HVAC.Station.Row2.Driver.Temperature": {
"default_value": 25,
"default_change_factor": 2,
"actions": {
@@ -272,7 +272,7 @@
}
}
},
- "Vehicle.Cabin.HVAC.Station.Row2.Right.Temperature": {
+ "Vehicle.Cabin.HVAC.Station.Row2.Passenger.Temperature": {
"default_value": 25,
"default_change_factor": 2,
"actions": {
diff --git a/setup.py b/setup.py
index 99f6ace..f8bdcb4 100644
--- a/setup.py
+++ b/setup.py
@@ -16,11 +16,12 @@ setup(
"grpcio-tools>=1.45.0",
"vosk==0.3.42",
"PyGObject==3.42.0",
- "rasa==3.6.4",
- "numpy==1.22.3",
- "tensorflow==2.12.0",
- "tensorboard==2.12.0",
- "keras==2.12.0",
+ "rasa==3.7.0b2",
+ "numpy==1.26.4",
+ "tensorflow",
+ "tensorboard",
+ "keras==3.4.0",
+ "openai-whisper"
],
license="Apache-2.0",
python_requires=">=3.9",