Audio stream from browser to Azure Speech to Text

Hi guys, I have been working on a conversational agent where I need to stream audio from the browser to Azure Speech to Text. Everything works fine, but partway through a session the speech recognition stops with the event "End of stream detected". I need Speech to Text to keep listening continuously and keep receiving the audio stream for a real-time conversational agent. Any ideas or help would be appreciated. Thanks!

This is the source code:

import azure.cognitiveservices.speech as speechsdk
import asyncio
import logging
import time
from logging_config import get_logger

# Module-level logger obtained from the project's logging_config.get_logger();
# every log call in this module goes through it.
logger = get_logger()

class SpeechToText:
    """Wrap an Azure Speech SDK continuous recognizer fed by a push stream.

    Typical lifecycle: ``create_recognizer()`` builds a recognizer bound to a
    fresh ``PushAudioInputStream``, ``start_recognition()`` starts continuous
    recognition, the caller writes audio bytes into the returned stream, and
    ``stop_recognition()`` stops the recognizer and closes the stream.

    Attributes:
        recognizer: The current ``SpeechRecognizer``, or ``None`` when no
            recognizer has been created (or it has been stopped).
        stream: The push stream bound to ``recognizer``, or ``None``.
        restart_count: Number of throttled ``start_recognition`` calls since
            the last accepted start (or the last forced cooldown).
        max_restarts: Throttled-call count that triggers a blocking cooldown.
        last_restart_time: ``time.time()`` of the last accepted start.
        restart_cooldown: Minimum seconds between accepted starts; also the
            length of the forced cooldown sleep.
    """

    def __init__(self, speech_key, speech_region, speech_endpoint):
        """Store the credentials and initialise the restart throttle state.

        Args:
            speech_key: Azure Speech subscription key.
            speech_region: Azure region passed to ``SpeechConfig``.
            speech_endpoint: Optional value assigned to
                ``SpeechConfig.endpoint_id`` when truthy.
        """
        self.speech_key = speech_key
        self.speech_region = speech_region
        self.speech_endpoint = speech_endpoint
        self.recognizer = None
        self.stream = None
        self.restart_count = 0
        self.max_restarts = 3
        self.last_restart_time = 0
        self.restart_cooldown = 5  # seconds
        logger.info("Speech-to-Text service initialized")

    def create_recognizer(self, loop, queue):
        """Create a speech recognizer that will process pushed audio input.

        Recognized phrases are forwarded to ``queue`` as
        ``{"text": str, "time": float}`` dictionaries. The SDK callbacks are
        scheduled onto ``loop`` with ``asyncio.run_coroutine_threadsafe``.

        Args:
            loop: The asyncio event loop on which ``queue.put`` is scheduled.
            queue: An object whose ``put(data)`` returns a coroutine
                (e.g. ``asyncio.Queue``).

        Returns:
            A ``(speech_recognizer, stream)`` tuple; both are also stored on
            the instance as ``self.recognizer`` and ``self.stream``.

        Raises:
            Exception: Any error raised while building the recognizer is
                logged and re-raised unchanged.
        """
        try:
            logger.info("Creating speech recognizer")
            speech_config = speechsdk.SpeechConfig(
                subscription=self.speech_key,
                region=self.speech_region
            )

            if self.speech_endpoint:
                # NOTE(review): endpoint_id is presumably the custom-model
                # endpoint ID, not a service URL - confirm what callers pass.
                speech_config.endpoint_id = self.speech_endpoint

            # NOTE(review): compressed container formats presumably require
            # GStreamer on the host; if decoding fails the session may end
            # early - verify against the SDK documentation.
            stream_format = speechsdk.audio.AudioStreamFormat(
                compressed_stream_format=speechsdk.AudioStreamContainerFormat.ANY
            )

            stream = speechsdk.audio.PushAudioInputStream(stream_format)
            audio_config = speechsdk.audio.AudioConfig(stream=stream)

            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config,
                audio_config=audio_config,
                language="en-US"
            )

            # Event callbacks
            def recognizing_cb(evt):
                # Partial-result callback; intentionally a no-op.
                pass

            def recognized_cb(evt):
                recognized_text = evt.result.text
                # Skip empty / whitespace-only results.
                if not recognized_text or not recognized_text.strip():
                    return
                logger.info(f"Speech recognized: '{recognized_text}'")
                # Send the recognized text and current timestamp
                data = {
                    "text": recognized_text,
                    "time": time.time()
                }
                # Hand the result to the event loop without awaiting here.
                asyncio.run_coroutine_threadsafe(queue.put(data), loop)

            def stop_cb(evt):
                logger.info("Speech recognizer session stopped")
                # Don't auto-restart here - let the watchdog handle it if needed

            def canceled_cb(evt):
                details = evt.result.cancellation_details
                if details.reason == speechsdk.CancellationReason.EndOfStream:
                    logger.info("End of stream detected")
                    # Don't auto-restart here - let the watchdog handle it if needed
                elif details.reason == speechsdk.CancellationReason.Error:
                    error_details = details.error_details
                    logger.error(f"Speech recognition error: {error_details}")

            # Connect callbacks to events
            speech_recognizer.recognizing.connect(recognizing_cb)
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(canceled_cb)

            self.recognizer = speech_recognizer
            self.stream = stream

            return speech_recognizer, stream

        except Exception as e:
            logger.exception(f"Error creating speech recognizer: {str(e)}")
            raise

    def start_recognition(self):
        """Start continuous speech recognition, subject to restart throttling.

        A call made less than ``restart_cooldown`` seconds after the last
        accepted start is rejected: it only increments ``restart_count`` and
        returns without starting anything. Once ``max_restarts`` rejected
        calls have accumulated, the call additionally blocks for
        ``restart_cooldown`` seconds and resets the counter.

        NOTE: the forced cooldown uses ``time.sleep`` and therefore blocks
        the calling thread for the whole cooldown.
        """
        current_time = time.time()

        # Rate limit the restarts
        if current_time - self.last_restart_time < self.restart_cooldown:
            self.restart_count += 1
            if self.restart_count >= self.max_restarts:
                logger.warning(f"Too many recognition restarts ({self.restart_count}). Adding cooldown.")
                time.sleep(self.restart_cooldown)
                self.restart_count = 0
            return

        self.last_restart_time = current_time
        # An accepted start ends the current burst; without this reset,
        # throttled attempts from unrelated bursts would add up and trigger
        # the blocking cooldown spuriously.
        self.restart_count = 0

        if self.recognizer is not None:
            logger.info("Starting continuous speech recognition")
            self.recognizer.start_continuous_recognition()
        else:
            logger.error("Cannot start recognition - recognizer not initialized")

    def stop_recognition(self):
        """Stop continuous speech recognition and close the audio stream.

        The stored recognizer and stream are cleared, so a later
        ``start_recognition`` cannot restart a recognizer whose input stream
        is already closed (which would presumably cancel immediately with
        "End of stream"); call ``create_recognizer`` again before restarting.
        The stream is closed even if stopping the recognizer raises.
        """
        recognizer, stream = self.recognizer, self.stream
        # Drop the handles first so the instance never exposes a closed
        # stream, even if one of the calls below raises.
        self.recognizer = None
        self.stream = None
        try:
            if recognizer is not None:
                logger.info("Stopping continuous speech recognition")
                recognizer.stop_continuous_recognition()
        finally:
            if stream is not None:
                logger.info("Closing audio stream")
                stream.close()



Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Audio stream from browser to Azure Speech to Text #2792

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Audio stream from browser to Azure Speech to Text #2792

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions