diff options
author | Anuj Solanki <anuj603362@gmail.com> | 2024-06-16 18:49:45 +0530 |
---|---|---|
committer | Anuj Solanki <anuj603362@gmail.com> | 2024-09-07 20:16:14 +0530 |
commit | 1144fcd343bc56f8c27ff73d3e76904010dbb832 (patch) | |
tree | 490915cd969f19b4eb3b3dd480554b27c1058243 /agl_service_voiceagent/utils/stt_online_service.py | |
parent | f2b62ba4da5a178221c3210c2d468cd684e626cc (diff) |
Integrate Whisper AI into agl-service-voiceagent
V1:
- Integrated Whisper AI for speech-to-text functionality into
agl-service-voiceagent.
- Add support for both online and offline modes.
- Implemented a gRPC-based connection for online mode between
Whisper ASR service and voice-agent service.
V2:
- Update kuksa-interface
- Add whisper-cpp for speech-to-text functionality
- Add support to control media using mpd
- Fix audio recorder
Bug-AGL: SPEC-5200
Change-Id: I2661ae61ba2c3283bcfde26d6e4f498270240b19
Signed-off-by: Anuj Solanki <anuj603362@gmail.com>
Diffstat (limited to 'agl_service_voiceagent/utils/stt_online_service.py')
-rw-r--r-- | agl_service_voiceagent/utils/stt_online_service.py | 88 |
1 files changed, 88 insertions, 0 deletions
class STTOnlineService:
    """
    Client for an online gRPC-based Whisper ASR (speech-to-text) service.

    Typical usage is ``initialize_connection()`` once, then any number of
    ``recognize_audio()`` calls, then ``close_connection()``. Errors are
    reported with ``print`` and signalled through return values rather than
    raised, so callers can fall back to another recognizer.
    """

    def __init__(self, server_address, server_port, server_timeout=5):
        """
        Store the connection settings; no network activity happens here.

        Args:
            server_address (str): Host name or IP address of the ASR service.
            server_port (int): Port number of the ASR service.
            server_timeout (int, optional): Per-request timeout in seconds
                (default is 5).
        """
        self.server_address = server_address
        self.server_port = server_port
        self.server_timeout = server_timeout
        # The channel is kept on the instance so close_connection() can
        # actually release it; a local variable would leak it.
        self.channel = None
        self.client = None
        self.initialized = False

    def _release_channel(self):
        """Close the current gRPC channel, if any, and forget it."""
        channel, self.channel = self.channel, None
        if channel is None:
            return
        try:
            channel.close()
        except Exception as e:
            # Closing is best-effort: a failure here must not prevent the
            # caller from resetting or re-creating the connection.
            print("Error closing online speech-to-text channel:", e)

    def initialize_connection(self):
        """
        Create the gRPC channel and client stub for the ASR service.

        Any channel left over from a previous call is closed first, so the
        method can be called repeatedly without leaking channels.

        NOTE(review): gRPC channels normally connect lazily, so a True
        result likely means only that the stub was created, not that the
        server is reachable -- confirm against the gRPC documentation.

        Returns:
            bool: True if the stub was created, False otherwise.
        """
        self._release_channel()
        try:
            self.channel = grpc.insecure_channel(f"{self.server_address}:{self.server_port}")
            self.client = audio_processing_pb2_grpc.AudioProcessingStub(self.channel)
            self.initialized = True
            print("STTOnlineService initialized with server address:",self.server_address,"and port:",self.server_port,"and timeout:",self.server_timeout,"seconds.")
        except Exception as e:
            print("Error initializing online speech-to-text service:",e)
            # Do not keep a half-built connection around.
            self._release_channel()
            self.client = None
            self.initialized = False
        return self.initialized

    def close_connection(self):
        """
        Close the connection to the online speech-to-text service.

        Safe to call when no connection is open.

        Returns:
            bool: Always True; the service is uninitialized afterwards.
        """
        self._release_channel()
        self.client = None
        self.initialized = False
        return True

    def recognize_audio(self, audio_file):
        """
        Send the contents of an audio file to the ASR service.

        Args:
            audio_file (str): Path of the audio file to transcribe.

        Returns:
            str | None: The recognized text, or None if the service is not
            initialized or the file read / remote call fails.
        """
        if not self.initialized:
            print("STTOnlineService not initialized.")
            return None
        try:
            # Use a distinct name for the handle so the path argument is not
            # shadowed inside the method.
            with open(audio_file, 'rb') as audio_stream:
                audio_data = audio_stream.read()
            request = audio_processing_pb2.AudioRequest(audio_data=audio_data)
            response = self.client.ProcessAudio(request, timeout=self.server_timeout)
            return response.text
        except Exception as e:
            print("Error recognizing audio:",e)
            return None
\ No newline at end of file |