
Commit f585f9e

Add type hints and docstrings across modules
1 parent 28af5fa commit f585f9e

9 files changed (+839, -260 lines)

stellascript/audio/capture.py

Lines changed: 67 additions & 24 deletions
@@ -1,29 +1,75 @@
 # stellascript/audio/capture.py
 
-import pyaudio
+"""
+Handles audio capture from the microphone using PyAudio.
+"""
+
+import threading
 from contextlib import contextmanager
+from typing import Callable, Generator, Optional
+
+import pyaudio
+
 from ..logging_config import get_logger
 
 logger = get_logger(__name__)
 
+
 class AudioCapture:
-    def __init__(self, format, channels, rate, chunk):
-        self.format_str = format
-        self.format = self._get_pyaudio_format(format)
-        self.channels = channels
-        self.rate = rate
-        self.chunk = chunk
-        self.pyaudio_instance = None
-        self.stream = None
-
-    def _get_pyaudio_format(self, format_str):
+    """
+    A class to manage audio recording from the microphone.
+
+    This class provides a context manager to handle the lifecycle of a PyAudio
+    stream, ensuring that resources are properly opened and closed.
+    """
+
+    def __init__(self, format: str, channels: int, rate: int, chunk: int) -> None:
+        """
+        Initializes the AudioCapture instance.
+
+        Args:
+            format (str): The audio format string (e.g., "paFloat32").
+            channels (int): The number of audio channels.
+            rate (int): The sampling rate in Hz.
+            chunk (int): The number of frames per buffer.
+        """
+        self.format_str: str = format
+        self.format: int = self._get_pyaudio_format(format)
+        self.channels: int = channels
+        self.rate: int = rate
+        self.chunk: int = chunk
+        self.pyaudio_instance: Optional[pyaudio.PyAudio] = None
+        self.stream: Optional[pyaudio.Stream] = None
+
+    def _get_pyaudio_format(self, format_str: str) -> int:
+        """
+        Converts a format string to a PyAudio format constant.
+
+        Args:
+            format_str (str): The string representation of the format.
+
+        Returns:
+            int: The corresponding PyAudio format constant.
+
+        Raises:
+            ValueError: If the format string is not supported.
+        """
         if format_str == "paFloat32":
             return pyaudio.paFloat32
         # Add other formats if needed
         raise ValueError(f"Unsupported audio format: {format_str}")
 
     @contextmanager
-    def audio_stream(self, callback):
+    def audio_stream(self, callback: Callable) -> Generator[Optional[pyaudio.Stream], None, None]:
+        """
+        A context manager for opening and managing a PyAudio stream.
+
+        Args:
+            callback (Callable): The callback function to process audio chunks.
+
+        Yields:
+            Optional[pyaudio.Stream]: The PyAudio stream object.
+        """
         self.pyaudio_instance = pyaudio.PyAudio()
         try:
             self.stream = self.pyaudio_instance.open(
@@ -40,33 +86,30 @@ def audio_stream(self, callback):
             if self.stream:
                 try:
                     if self.stream.is_active():
-                        # Utiliser stop_stream avec gestion du timeout
-                        import threading
-
-                        # Pass stream object as an argument to make it explicit for Pylance
-                        def force_stop(stream_to_stop):
+                        # Use stop_stream with timeout management
+                        def force_stop(stream_to_stop: pyaudio.Stream) -> None:
                             try:
                                 if stream_to_stop:
                                     stream_to_stop.stop_stream()
                             except Exception:
                                 pass
-
-                        # Lancer l'arrêt dans un thread avec timeout
+
+                        # Run the stop in a thread with a timeout
                         stop_thread = threading.Thread(target=force_stop, args=(self.stream,), daemon=True)
                         stop_thread.start()
-                        stop_thread.join(timeout=0.2)  # Attendre max 200ms
-
-                        # Si le thread n'a pas fini, on continue quand même
+                        stop_thread.join(timeout=0.2)  # Wait max 200ms
+
+                        # If the thread is still running, continue anyway
                         if stop_thread.is_alive():
                             logger.warning("Stream stop timed out, continuing anyway")
                 except Exception:
                     pass
-
+
             try:
                 self.stream.close()
             except Exception:
                 pass
-
+
             if self.pyaudio_instance:
                 try:
                     self.pyaudio_instance.terminate()
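For orientation, here is a minimal usage sketch of the new typed API. It is not part of this commit, and it assumes the callback passed to audio_stream() is wired to PyAudio's standard stream_callback protocol and that the constants come from stellascript/config.py.

# Hypothetical usage sketch -- not part of this commit.
import time

import pyaudio

from stellascript.audio.capture import AudioCapture
from stellascript.config import CHANNELS, CHUNK, FORMAT, RATE


def on_chunk(in_data, frame_count, time_info, status):
    # Receive raw float32 bytes here (e.g. push them onto a queue for transcription).
    return (None, pyaudio.paContinue)


capture = AudioCapture(format=FORMAT, channels=CHANNELS, rate=RATE, chunk=CHUNK)
with capture.audio_stream(on_chunk) as stream:
    while stream is not None and stream.is_active():
        time.sleep(0.1)  # PyAudio invokes on_chunk on its own thread

Note that the teardown path in the diff above stops the stream from a daemon thread with a 200 ms join timeout, so exiting this with-block cannot hang on a misbehaving audio backend.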

stellascript/audio/enhancement.py

Lines changed: 43 additions & 10 deletions
@@ -1,24 +1,57 @@
 # stellascript/audio/enhancement.py
 
+"""
+Handles audio enhancement using various methods like DeepFilterNet and Demucs.
+"""
+
 import warnings
+from typing import Any, Optional
+
 import numpy as np
 import torch
 import torchaudio
+
 from ..logging_config import get_logger
 
 logger = get_logger(__name__)
 
+
 class AudioEnhancer:
-    def __init__(self, enhancement_method, device, rate):
-        self.enhancement_method = enhancement_method
-        self.device = device
-        self.rate = rate
-        self.demucs_model = None
-        self.df_model = None
-        self.df_state = None
-
-    def apply(self, audio_data, is_live=False):
-        """Apply selected audio enhancement method."""
+    """
+    A class to apply audio enhancement techniques to audio data.
+
+    This class supports multiple enhancement methods and handles the loading
+    of the necessary models.
+    """
+
+    def __init__(self, enhancement_method: str, device: torch.device, rate: int) -> None:
+        """
+        Initializes the AudioEnhancer.
+
+        Args:
+            enhancement_method (str): The enhancement method to use ('none',
+                'deepfilternet', 'demucs').
+            device (torch.device): The device to run the models on (CPU or CUDA).
+            rate (int): The sample rate of the input audio.
+        """
+        self.enhancement_method: str = enhancement_method
+        self.device: torch.device = device
+        self.rate: int = rate
+        self.demucs_model: Optional[Any] = None
+        self.df_model: Optional[Any] = None
+        self.df_state: Optional[Any] = None
+
+    def apply(self, audio_data: np.ndarray, is_live: bool = False) -> np.ndarray:
+        """
+        Apply the selected audio enhancement method.
+
+        Args:
+            audio_data (np.ndarray): The input audio data as a NumPy array.
+            is_live (bool): Flag indicating if the processing is for a live stream.
+
+        Returns:
+            np.ndarray: The enhanced audio data.
+        """
         if self.enhancement_method == "none":
             return audio_data
 
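A small usage sketch of the typed apply() signature (not part of this commit; module paths follow the file headers above). With enhancement_method "none" the diff shows the input array is returned unchanged, which the assert below relies on; 'deepfilternet' and 'demucs' additionally require their models to be available.

# Hypothetical usage sketch -- not part of this commit.
import numpy as np
import torch

from stellascript.audio.enhancement import AudioEnhancer
from stellascript.config import RATE

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
enhancer = AudioEnhancer(enhancement_method="none", device=device, rate=RATE)

buffer = np.zeros(RATE, dtype=np.float32)        # one second of silence at 16 kHz
enhanced = enhancer.apply(buffer, is_live=False)
assert enhanced is buffer                        # "none" is a pass-through per the diff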

stellascript/cli.py

Lines changed: 31 additions & 4 deletions
@@ -7,8 +7,18 @@
 
 logger = get_logger(__name__)
 
-def parse_args():
-    """Parses command-line arguments."""
+def parse_args() -> argparse.Namespace:
+    """
+    Parses command-line arguments for the Stellascript application.
+
+    This function sets up an ArgumentParser to handle various command-line options
+    for transcription, including language, model selection, input file,
+    diarization, and audio enhancement. It also includes argument validation
+    to ensure compatibility between different options.
+
+    Returns:
+        argparse.Namespace: An object containing the parsed command-line arguments.
+    """
     parser = argparse.ArgumentParser(
         description="Transcribe audio live from microphone or from a file."
     )
@@ -92,8 +102,25 @@ def parse_args():
     validate_args(args, parser)
     return args
 
-def validate_args(args, parser):
-    """Validates parsed arguments."""
+
+def validate_args(args: argparse.Namespace, parser: argparse.ArgumentParser) -> None:
+    """
+    Validates the parsed command-line arguments to ensure they are consistent.
+
+    This function checks for various invalid combinations of arguments, such as:
+    - Using speaker count constraints in live mode.
+    - Incompatible diarization and transcription modes.
+    - Misuse of the similarity threshold with certain diarization methods.
+    - Conflicting arguments for speaker count and similarity threshold.
+
+    Args:
+        args (argparse.Namespace): The parsed command-line arguments.
+        parser (argparse.ArgumentParser): The argument parser, used to report errors.
+
+    Raises:
+        SystemExit: If an invalid combination of arguments is found, the program
+            exits with an error message.
+    """
     if (args.min_speakers is not None or args.max_speakers is not None) and not args.file:
         parser.error("--min-speakers and --max-speakers can only be used in file mode (--file).")
 
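The validation here is plain argparse: cross-argument checks are routed through parser.error(), which prints the message and exits (hence the documented SystemExit). The sketch below mirrors the one check visible in this hunk; the add_argument definitions are assumptions, since the full option list lies outside the diff.

# Illustrative sketch of the validate_args pattern; flag definitions are assumed.
import argparse

parser = argparse.ArgumentParser(
    description="Transcribe audio live from microphone or from a file."
)
parser.add_argument("--file", default=None)
parser.add_argument("--min-speakers", type=int, default=None)
parser.add_argument("--max-speakers", type=int, default=None)

args = parser.parse_args(["--min-speakers", "2"])  # live mode: no --file given

# Same cross-argument check as in the hunk above; parser.error() raises SystemExit(2).
if (args.min_speakers is not None or args.max_speakers is not None) and not args.file:
    parser.error("--min-speakers and --max-speakers can only be used in file mode (--file).")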

stellascript/config.py

Lines changed: 69 additions & 18 deletions
@@ -1,38 +1,89 @@
 # stellascript/config.py
 
+"""
+Configuration settings for the Stellascript application.
+
+This module defines various constants that control the behavior of the audio
+processing, transcription, and diarization pipeline. These settings are
+organized into sections for clarity and can be tuned to optimize performance
+for different use cases.
+
+Attributes:
+    FORMAT (str): The audio format used for recording, corresponding to PyAudio's
+        `paFloat32`.
+    CHANNELS (int): The number of audio channels (1 for mono).
+    RATE (int): The sampling rate in Hz (16000 Hz is standard for speech).
+    CHUNK (int): The number of samples per buffer, used for VAD processing.
+
+    TRANSCRIPTION_MAX_BUFFER_DURATION (float): The maximum duration of the audio
+        buffer for transcription in
+        seconds.
+    SUBTITLE_MAX_BUFFER_DURATION (float): The maximum duration of the audio
+        buffer for subtitle generation in
+        seconds.
+    VAD_SPEECH_THRESHOLD (float): The sensitivity threshold for the Voice
+        Activity Detection (VAD).
+    VAD_SILENCE_DURATION_S (float): The duration of silence in seconds that
+        triggers a segment split.
+    VAD_MIN_SPEECH_DURATION_S (float): The minimum duration of speech in seconds
+        to be considered a valid segment.
+
+    SUBTITLE_MAX_LENGTH (int): The maximum number of characters per subtitle line.
+    SUBTITLE_MAX_DURATION_S (float): The maximum duration of a single subtitle
+        line in seconds.
+    SUBTITLE_MAX_SILENCE_S (float): The maximum duration of silence to tolerate
+        before creating a new subtitle line.
+
+    MAX_MERGE_GAP_S (float): The maximum gap of silence in seconds between two
+        speech segments to be merged into one.
+
+    TARGET_CHUNK_DURATION_S (float): The target duration for audio chunks when
+        processing a file.
+    MAX_CHUNK_DURATION_S (float): The maximum allowed duration for an audio chunk.
+    MIN_SILENCE_GAP_S (float): The minimum duration of silence to be considered a
+        gap for chunking.
+
+    TRANSCRIPTION_PADDING_S (float): The duration of silence padding added to
+        audio segments before transcription.
+
+    MODELS (list[str]): A list of available Whisper models for transcription.
+"""
+
+from typing import List
+
 # Audio Configuration
-FORMAT = "paFloat32"  # Corresponds to pyaudio.paFloat32
-CHANNELS = 1
-RATE = 16000
-CHUNK = 512  # For VAD, 512 samples = 32ms at 16kHz
+FORMAT: str = "paFloat32"  # Corresponds to pyaudio.paFloat32
+CHANNELS: int = 1
+RATE: int = 16000
+CHUNK: int = 512  # For VAD, 512 samples = 32ms at 16kHz
 
 # Transcription Mode Buffering
-TRANSCRIPTION_MAX_BUFFER_DURATION = 75.0  # 1min15s
+TRANSCRIPTION_MAX_BUFFER_DURATION: float = 75.0  # 1min15s
 
 # Subtitle Mode Buffering & VAD
-SUBTITLE_MAX_BUFFER_DURATION = 15.0  # 15s for real-time response
-VAD_SPEECH_THRESHOLD = 0.4  # Lower threshold for higher sensitivity
-VAD_SILENCE_DURATION_S = 0.3  # Shorter silence duration to split segments
-VAD_MIN_SPEECH_DURATION_S = 0.2
+SUBTITLE_MAX_BUFFER_DURATION: float = 15.0  # 15s for real-time response
+VAD_SPEECH_THRESHOLD: float = 0.4  # Lower threshold for higher sensitivity
+VAD_SILENCE_DURATION_S: float = 0.3  # Shorter silence duration to split segments
+VAD_MIN_SPEECH_DURATION_S: float = 0.2
 
 # Subtitle Generation
-SUBTITLE_MAX_LENGTH = 80  # Max characters per subtitle line
-SUBTITLE_MAX_DURATION_S = 15.0  # Max duration of a single subtitle line
-SUBTITLE_MAX_SILENCE_S = 0.5  # Max silence to tolerate before creating a new line
+SUBTITLE_MAX_LENGTH: int = 80  # Max characters per subtitle line
+SUBTITLE_MAX_DURATION_S: float = 15.0  # Max duration of a single subtitle line
+SUBTITLE_MAX_SILENCE_S: float = 0.5  # Max silence to tolerate before creating a new line
 
 # Speaker Diarization
-MAX_MERGE_GAP_S = 5.0  # Max silence between segments to merge
+MAX_MERGE_GAP_S: float = 5.0  # Max silence between segments to merge
 
 # File Transcription Chunking
-TARGET_CHUNK_DURATION_S = 90.0
-MAX_CHUNK_DURATION_S = 120.0
-MIN_SILENCE_GAP_S = 0.5
+TARGET_CHUNK_DURATION_S: float = 90.0
+MAX_CHUNK_DURATION_S: float = 120.0
+MIN_SILENCE_GAP_S: float = 0.5
 
 # Transcription Padding
-TRANSCRIPTION_PADDING_S = 1.5  # 1.5s of silence padding
+TRANSCRIPTION_PADDING_S: float = 1.5  # 1.5s of silence padding
 
 # List of available Whisper models
-MODELS = [
+MODELS: List[str] = [
     "tiny.en", "tiny", "base.en", "base", "small.en", "small",
     "medium.en", "medium", "large-v1", "large-v2", "large-v3", "large",
     "distil-large-v2", "distil-medium.en", "distil-small.en"
