Skip to content

Audio stream from browser to Azure Speech to Text #2792

@ElamaranDev

Description

@ElamaranDev

Hi guys, I have been working on a conversational agent where I need to send audio from the browser to Azure Speech to Text (streaming the audio). Everything works fine, but partway through, speech recognition stops with the event "End of stream detected". I need Speech to Text to listen continuously and receive the audio stream for a real-time conversational agent. Any ideas or help would be appreciated. Thanks.

This is the source code:

import azure.cognitiveservices.speech as speechsdk
import asyncio
import logging
import time
from logging_config import get_logger

# Shared module-level logger, obtained from logging_config.get_logger().
logger = get_logger()

class SpeechToText:
    """Wrap an Azure Speech SDK recognizer that is fed by a push audio stream.

    The instance remembers the most recently created recognizer and its
    input stream, together with the counters used to throttle repeated
    start requests.
    """

    def __init__(self, speech_key, speech_region, speech_endpoint):
        """Store the service settings and reset the recognizer state.

        Args:
            speech_key: Subscription key passed to ``speechsdk.SpeechConfig``.
            speech_region: Region passed to ``speechsdk.SpeechConfig``.
            speech_endpoint: When truthy, assigned to the config's
                ``endpoint_id`` in ``create_recognizer``.
        """
        self.speech_key = speech_key
        self.speech_region = speech_region
        self.speech_endpoint = speech_endpoint
        self.recognizer = None
        self.stream = None
        self.restart_count = 0
        self.max_restarts = 3
        self.last_restart_time = 0
        self.restart_cooldown = 5  # seconds
        logger.info("Speech-to-Text service initialized")

    def create_recognizer(self, loop, queue):
        """Create a speech recognizer that will process pushed audio input.

        Args:
            loop: Event loop on which ``queue.put`` is scheduled from the
                recognizer callbacks.
            queue: Object whose ``put`` returns a coroutine (presumably an
                ``asyncio.Queue`` - confirm against the caller). Each
                recognized utterance is enqueued as
                ``{"text": <str>, "time": <time.time() float>}``.

        Returns:
            A ``(speech_recognizer, stream)`` tuple; both are also stored on
            the instance as ``self.recognizer`` and ``self.stream``.

        Raises:
            Exception: Any error raised while building the recognizer is
                logged and re-raised unchanged.
        """
        try:
            logger.info("Creating speech recognizer")
            speech_config = speechsdk.SpeechConfig(
                subscription=self.speech_key,
                region=self.speech_region,
            )

            if self.speech_endpoint:
                speech_config.endpoint_id = self.speech_endpoint

            # Named ``stream_format`` so the ``format`` builtin is not shadowed.
            stream_format = speechsdk.audio.AudioStreamFormat(
                compressed_stream_format=speechsdk.AudioStreamContainerFormat.ANY
            )

            stream = speechsdk.audio.PushAudioInputStream(stream_format)
            audio_config = speechsdk.audio.AudioConfig(stream=stream)

            speech_recognizer = speechsdk.SpeechRecognizer(
                speech_config=speech_config,
                audio_config=audio_config,
                language="en-US",
            )

            def recognizing_cb(evt):
                # Intermediate hypotheses are intentionally ignored.
                pass

            def recognized_cb(evt):
                recognized_text = evt.result.text
                # Skip empty or whitespace-only results.
                if not recognized_text or not recognized_text.strip():
                    return
                logger.info(f"Speech recognized: '{recognized_text}'")
                data = {
                    "text": recognized_text,
                    "time": time.time(),
                }
                # Schedule the put on ``loop`` in a thread-safe way; the SDK
                # callback presumably does not run on the loop's own thread.
                asyncio.run_coroutine_threadsafe(queue.put(data), loop)

            def stop_cb(evt):
                logger.info("Speech recognizer session stopped")
                # Don't auto-restart here - let the watchdog handle it if needed

            def canceled_cb(evt):
                details = evt.result.cancellation_details
                if details.reason == speechsdk.CancellationReason.EndOfStream:
                    logger.info("End of stream detected")
                    # Don't auto-restart here - let the watchdog handle it if needed
                elif details.reason == speechsdk.CancellationReason.Error:
                    logger.error(f"Speech recognition error: {details.error_details}")

            # Connect callbacks to events
            speech_recognizer.recognizing.connect(recognizing_cb)
            speech_recognizer.recognized.connect(recognized_cb)
            speech_recognizer.session_stopped.connect(stop_cb)
            speech_recognizer.canceled.connect(canceled_cb)

            self.recognizer = speech_recognizer
            self.stream = stream

            return speech_recognizer, stream

        except Exception as e:
            logger.exception(f"Error creating speech recognizer: {e}")
            raise

    def start_recognition(self):
        """Start continuous speech recognition, throttling rapid restarts.

        A call made less than ``restart_cooldown`` seconds after the previous
        successful start is counted and ignored. Once ``max_restarts`` such
        calls have accumulated, the method additionally sleeps for
        ``restart_cooldown`` seconds and resets the counter before returning.
        """
        # Check the recognizer first so that a premature call does not stamp
        # ``last_restart_time`` and throttle the first real start.
        if not self.recognizer:
            logger.error("Cannot start recognition - recognizer not initialized")
            return

        current_time = time.time()

        # Rate limit the restarts
        if current_time - self.last_restart_time < self.restart_cooldown:
            self.restart_count += 1
            if self.restart_count >= self.max_restarts:
                logger.warning(f"Too many recognition restarts ({self.restart_count}). Adding cooldown.")
                # NOTE(review): this blocks the calling thread; if this method
                # is invoked from an event loop thread, consider moving the
                # wait to the caller.
                time.sleep(self.restart_cooldown)
                self.restart_count = 0
            return

        self.last_restart_time = current_time
        logger.info("Starting continuous speech recognition")
        self.recognizer.start_continuous_recognition()

    def stop_recognition(self):
        """Stop continuous speech recognition and close the audio stream.

        The stream is closed even if stopping the recognizer raises; the
        exception from the recognizer still propagates to the caller.
        """
        try:
            if self.recognizer:
                logger.info("Stopping continuous speech recognition")
                self.recognizer.stop_continuous_recognition()
        finally:
            if self.stream:
                logger.info("Closing audio stream")
                self.stream.close()

Metadata

Metadata

Assignees

No one assigned

    Labels

    update needed: For items that are in progress but have not been updated

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions