about summary refs log tree commit diff stats
path: root/agl_service_voiceagent/utils/stt_model.py
diff options
context:
space:
mode:
Diffstat (limited to 'agl_service_voiceagent/utils/stt_model.py')
-rw-r--r-- agl_service_voiceagent/utils/stt_model.py 83
1 files changed, 58 insertions, 25 deletions
diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py
index 5337162..d51ae31 100644
--- a/agl_service_voiceagent/utils/stt_model.py
+++ b/agl_service_voiceagent/utils/stt_model.py
@@ -21,21 +21,61 @@ import wave
from agl_service_voiceagent.utils.common import generate_unique_uuid
class STTModel:
+ """
+ STTModel is a class for speech-to-text (STT) recognition using the Vosk speech recognition library.
+ """
+
def __init__(self, model_path, sample_rate=16000):
+ """
+ Initialize the STTModel instance with the provided model and sample rate.
+
+ Args:
+ model_path (str): The path to the Vosk speech recognition model.
+ sample_rate (int, optional): The audio sample rate in Hz (default is 16000).
+ """
self.sample_rate = sample_rate
self.model = vosk.Model(model_path)
self.recognizer = {}
self.chunk_size = 1024
+
def setup_recognizer(self):
+ """
+ Set up a Vosk recognizer for a new session and return a unique identifier (UUID) for the session.
+
+ Returns:
+ str: A unique identifier (UUID) for the session.
+ """
uuid = generate_unique_uuid(6)
self.recognizer[uuid] = vosk.KaldiRecognizer(self.model, self.sample_rate)
return uuid
+
def init_recognition(self, uuid, audio_data):
+ """
+ Feed a chunk of audio data to the Vosk recognizer of a session.
+
+ Args:
+ uuid (str): The unique identifier (UUID) for the session.
+ audio_data (bytes): Audio data to process.
+
+ Returns:
+ bool: Result of AcceptWaveform(); truthy once Vosk has detected the end of an utterance, falsy while decoding continues.
+ """
return self.recognizer[uuid].AcceptWaveform(audio_data)
+
def recognize(self, uuid, partial=False):
+ """
+ Retrieve the recognition result for a session and return it as a dict parsed from the recognizer's JSON output.
+
+ Args:
+ uuid (str): The unique identifier (UUID) for the session.
+ partial (bool, optional): If True, return partial recognition results (default is False).
+
+ Returns:
+ dict: The recognition result, parsed from the JSON string produced by the recognizer.
+ """
self.recognizer[uuid].SetWords(True)
if partial:
result = json.loads(self.recognizer[uuid].PartialResult())
@@ -44,7 +84,18 @@ class STTModel:
self.recognizer[uuid].Reset()
return result
+
def recognize_from_file(self, uuid, filename):
+ """
+ Recognize speech from an audio file and return the recognized text.
+
+ Args:
+ uuid (str): The unique identifier (UUID) for the session.
+ filename (str): The path to the audio file.
+
+ Returns:
+ str: The recognized text, or a status code string such as "FILE_NOT_FOUND" or "VOICE_NOT_RECOGNIZED" on failure.
+ """
if not os.path.exists(filename):
print(f"Audio file '{filename}' not found.")
return "FILE_NOT_FOUND"
@@ -75,31 +126,13 @@ class STTModel:
print("Voice not recognized. Please speak again...")
return "VOICE_NOT_RECOGNIZED"
- def cleanup_recognizer(self, uuid):
- del self.recognizer[uuid]
-import wave
-
-def read_wav_file(filename, chunk_size=1024):
- try:
- wf = wave.open(filename, "rb")
- if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE":
- print("Audio file must be WAV format mono PCM.")
- return "FILE_FORMAT_INVALID"
-
- audio_data = b"" # Initialize an empty bytes object to store audio data
- while True:
- chunk = wf.readframes(chunk_size)
- if not chunk:
- break # End of file reached
- audio_data += chunk
-
- return audio_data
- except Exception as e:
- print(f"Error reading audio file: {e}")
- return None
+ def cleanup_recognizer(self, uuid):
+ """
+ Clean up and remove the Vosk recognizer for a session.
-# Example usage:
-filename = "your_audio.wav"
-audio_data = read_wav_file(filename)
+ Args:
+ uuid (str): The unique identifier (UUID) for the session.
+ """
+ del self.recognizer[uuid]
\ No newline at end of file