diff options
Diffstat (limited to 'agl_service_voiceagent/utils/stt_model.py')
-rw-r--r-- | agl_service_voiceagent/utils/stt_model.py | 83 |
1 files changed, 58 insertions, 25 deletions
diff --git a/agl_service_voiceagent/utils/stt_model.py b/agl_service_voiceagent/utils/stt_model.py index 5337162..d51ae31 100644 --- a/agl_service_voiceagent/utils/stt_model.py +++ b/agl_service_voiceagent/utils/stt_model.py @@ -21,21 +21,61 @@ import wave from agl_service_voiceagent.utils.common import generate_unique_uuid class STTModel: + """ + STTModel is a class for speech-to-text (STT) recognition using the Vosk speech recognition library. + """ + def __init__(self, model_path, sample_rate=16000): + """ + Initialize the STTModel instance with the provided model and sample rate. + + Args: + model_path (str): The path to the Vosk speech recognition model. + sample_rate (int, optional): The audio sample rate in Hz (default is 16000). + """ self.sample_rate = sample_rate self.model = vosk.Model(model_path) self.recognizer = {} self.chunk_size = 1024 + def setup_recognizer(self): + """ + Set up a Vosk recognizer for a new session and return a unique identifier (UUID) for the session. + + Returns: + str: A unique identifier (UUID) for the session. + """ uuid = generate_unique_uuid(6) self.recognizer[uuid] = vosk.KaldiRecognizer(self.model, self.sample_rate) return uuid + def init_recognition(self, uuid, audio_data): + """ + Initialize the Vosk recognizer for a session with audio data. + + Args: + uuid (str): The unique identifier (UUID) for the session. + audio_data (bytes): Audio data to process. + + Returns: + bool: True if initialization was successful, False otherwise. + """ return self.recognizer[uuid].AcceptWaveform(audio_data) + def recognize(self, uuid, partial=False): + """ + Recognize speech and return the result as a JSON object. + + Args: + uuid (str): The unique identifier (UUID) for the session. + partial (bool, optional): If True, return partial recognition results (default is False). + + Returns: + dict: A JSON object containing recognition results. + """ self.recognizer[uuid].SetWords(True) if partial: result = json.loads(self.recognizer[uuid].PartialResult()) @@ -44,7 +84,18 @@ class STTModel: self.recognizer[uuid].Reset() return result + def recognize_from_file(self, uuid, filename): + """ + Recognize speech from an audio file and return the recognized text. + + Args: + uuid (str): The unique identifier (UUID) for the session. + filename (str): The path to the audio file. + + Returns: + str: The recognized text or error messages. + """ if not os.path.exists(filename): print(f"Audio file '{filename}' not found.") return "FILE_NOT_FOUND" @@ -75,31 +126,13 @@ class STTModel: print("Voice not recognized. Please speak again...") return "VOICE_NOT_RECOGNIZED" - def cleanup_recognizer(self, uuid): - del self.recognizer[uuid] -import wave - -def read_wav_file(filename, chunk_size=1024): - try: - wf = wave.open(filename, "rb") - if wf.getnchannels() != 1 or wf.getsampwidth() != 2 or wf.getcomptype() != "NONE": - print("Audio file must be WAV format mono PCM.") - return "FILE_FORMAT_INVALID" - - audio_data = b"" # Initialize an empty bytes object to store audio data - while True: - chunk = wf.readframes(chunk_size) - if not chunk: - break # End of file reached - audio_data += chunk - - return audio_data - except Exception as e: - print(f"Error reading audio file: {e}") - return None + def cleanup_recognizer(self, uuid): + """ + Clean up and remove the Vosk recognizer for a session. -# Example usage: -filename = "your_audio.wav" -audio_data = read_wav_file(filename) + Args: + uuid (str): The unique identifier (UUID) for the session. + """ + del self.recognizer[uuid]
\ No newline at end of file |