-
Notifications
You must be signed in to change notification settings - Fork 2k
Description
Hi guys, I have been working on a conversational agent where I need to send audio from the browser to Azure Speech to Text (streaming the audio). Everything works fine, but in between, the speech recognition stops with an event: "End of stream detected". I need Speech to Text to listen continuously and keep receiving the audio stream for a real-time conversational agent. Any ideas or help would be appreciated. Thanks.
This is the source code:
import azure.cognitiveservices.speech as speechsdk
import asyncio
import logging
import time
from logging_config import get_logger
logger = get_logger()
class SpeechToText:
    """Continuous Azure speech recognition fed by a push audio stream.

    Each non-empty recognized phrase is handed to an asyncio queue as a
    ``{"text": ..., "time": ...}`` dictionary. The class also keeps simple
    bookkeeping used to throttle rapid restarts of the recognizer.
    """

    def __init__(self, speech_key, speech_region, speech_endpoint):
        """Store the service settings and reset the restart bookkeeping.

        Args:
            speech_key: Subscription key passed to ``speechsdk.SpeechConfig``.
            speech_region: Region passed to ``speechsdk.SpeechConfig``.
            speech_endpoint: Optional value assigned to
                ``SpeechConfig.endpoint_id``; a falsy value is ignored.
        """
        self.speech_key = speech_key
        self.speech_region = speech_region
        self.speech_endpoint = speech_endpoint
        # Populated by create_recognizer().
        self.recognizer = None
        self.stream = None
        # Restart throttling state used by start_recognition().
        self.restart_count = 0
        self.max_restarts = 3
        self.last_restart_time = 0
        self.restart_cooldown = 5  # seconds
        logger.info("Speech-to-Text service initialized")

    def create_recognizer(self, loop, queue):
        """Create a speech recognizer that will process pushed audio input.

        The recognizer and its push stream are also stored on the instance
        as ``self.recognizer`` and ``self.stream``.

        Args:
            loop: Event loop on which ``queue.put`` is scheduled through
                ``asyncio.run_coroutine_threadsafe``.
            queue: Object whose ``put`` coroutine receives each recognized
                phrase as ``{"text": str, "time": float}``.

        Returns:
            A ``(speech_recognizer, stream)`` tuple.

        Raises:
            Exception: Any error raised while building the recognizer is
                logged and re-raised unchanged.
        """
        try:
            logger.info("Creating speech recognizer")
            speech_config = speechsdk.SpeechConfig(
                subscription=self.speech_key,
                region=self.speech_region,
            )
            if self.speech_endpoint:
                speech_config.endpoint_id = self.speech_endpoint

            # Named stream_format so the builtin ``format`` is not shadowed.
            stream_format = speechsdk.audio.AudioStreamFormat(
                compressed_stream_format=speechsdk.AudioStreamContainerFormat.ANY
            )
            stream = speechsdk.audio.PushAudioInputStream(stream_format)
            audio_config = speechsdk.audio.AudioConfig(stream=stream)
            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config,
                audio_config=audio_config,
                language="en-US",
            )

            # Event callbacks
            def recognizing_cb(evt):
                # Progress callback; partial results are intentionally ignored.
                pass

            def recognized_cb(evt):
                recognized_text = evt.result.text
                # Skip empty or whitespace-only results.
                if not (recognized_text and recognized_text.strip()):
                    return
                logger.info(f"Speech recognized: '{recognized_text}'")
                # Send the recognized text and current timestamp
                data = {
                    "text": recognized_text,
                    "time": time.time(),
                }
                # The thread-safe scheduler is used because this callback is
                # presumably invoked off the event-loop thread - TODO confirm.
                asyncio.run_coroutine_threadsafe(queue.put(data), loop)

            def stop_cb(evt):
                logger.info("Speech recognizer session stopped")
                # Don't auto-restart here - let the watchdog handle it if needed

            def canceled_cb(evt):
                details = evt.result.cancellation_details
                if details.reason == speechsdk.CancellationReason.EndOfStream:
                    logger.info("End of stream detected")
                    # Don't auto-restart here - let the watchdog handle it if needed
                elif details.reason == speechsdk.CancellationReason.Error:
                    error_details = details.error_details
                    logger.error(f"Speech recognition error: {error_details}")

            # Connect callbacks to events
            speech_recognizer.recognizing.connect(recognizing_cb)
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(canceled_cb)

            self.recognizer = speech_recognizer
            self.stream = stream
            return speech_recognizer, stream
        except Exception as e:
            logger.exception(f"Error creating speech recognizer: {str(e)}")
            raise

    def start_recognition(self):
        """Start continuous speech recognition, throttling rapid restarts.

        A call made less than ``restart_cooldown`` seconds after the previous
        start counts as a rapid restart. Once ``max_restarts`` rapid restarts
        have accumulated, the call sleeps for ``restart_cooldown`` seconds,
        resets the counter and returns WITHOUT starting recognition. A call
        made outside the cooldown window resets the counter, so only bursts
        of restarts trigger the throttle.

        NOTE: the cooldown uses ``time.sleep`` and therefore blocks the
        calling thread for ``restart_cooldown`` seconds.
        """
        current_time = time.time()

        # Rate limit the restarts
        if current_time - self.last_restart_time < self.restart_cooldown:
            self.restart_count += 1
            if self.restart_count >= self.max_restarts:
                logger.warning(f"Too many recognition restarts ({self.restart_count}). Adding cooldown.")
                time.sleep(self.restart_cooldown)
                self.restart_count = 0
                return
        else:
            # The previous start is old enough: the burst is over, so stale
            # rapid-restart counts must not leak into a later burst.
            self.restart_count = 0

        self.last_restart_time = current_time
        if self.recognizer:
            logger.info("Starting continuous speech recognition")
            self.recognizer.start_continuous_recognition()
        else:
            logger.error("Cannot start recognition - recognizer not initialized")

    def stop_recognition(self):
        """Stop continuous speech recognition and close the audio stream.

        Either step is skipped when the corresponding attribute is unset.
        """
        if self.recognizer:
            logger.info("Stopping continuous speech recognition")
            self.recognizer.stop_continuous_recognition()
        if self.stream:
            # NOTE(review): closing the push stream presumably signals
            # end-of-stream to the recognizer - confirm against the SDK docs.
            logger.info("Closing audio stream")
            self.stream.close()