
Commit fed5804

chwma0 and changwei.ma-halo authored
[touchtts] fix bug and support chunk-mask for touch_flow (#77)
Co-authored-by: changwei.ma-halo <[email protected]>
1 parent 018eede commit fed5804

4 files changed, +172 -4 lines changed


west/models/touch_flow/configuration_touch_flow.py

Lines changed: 19 additions & 0 deletions
@@ -21,19 +21,38 @@ def __init__(
         hidden_size: int = 0,
         inference_cfg_rate: float = 0.7,
         n_timesteps: int = 5,
+        max_speech_duration: float = 30,
+        min_speech_duration: float = 0.2,
+        decoding_chunk_size: int = 0,
+        enable_full_context: bool = True,
+        max_chunk_size: int = 86,
+        num_decoding_left_chunks: int = 0,
+        static_chunk_size: int = -1,
+        use_dynamic_chunk: bool = False,
+        use_dynamic_left_chunk: bool = False,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.llm_model_name_or_path = llm_model_name_or_path
         self.s3tokenizer_model_name_or_path = s3tokenizer_model_name_or_path
         self.speaker_model_path = speaker_model_path
+        self.text_tokenizer_path = text_tokenizer_path
         self.num_speech_tokens = num_speech_tokens
         self.t_scheduler = t_scheduler
         self.sigma_min = sigma_min
         self.training_cfg_rate = training_cfg_rate
         self.hidden_size = hidden_size
         self.inference_cfg_rate = inference_cfg_rate
         self.n_timesteps = n_timesteps
+        self.max_speech_duration = max_speech_duration
+        self.min_speech_duration = min_speech_duration
+        self.use_dynamic_chunk = use_dynamic_chunk
+        self.use_dynamic_left_chunk = use_dynamic_left_chunk
+        self.decoding_chunk_size = decoding_chunk_size
+        self.static_chunk_size = static_chunk_size
+        self.num_decoding_left_chunks = num_decoding_left_chunks
+        self.enable_full_context = enable_full_context
+        self.max_chunk_size = max_chunk_size
 
 
 __all__ = ["TouchFlowConfig"]
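For reference, the new chunk-mask options are plain constructor arguments. A minimal usage sketch, assuming west is importable as a package and that the constructor arguments not shown in this hunk (model paths, tokenizer paths, etc.) have defaults or are supplied elsewhere; the specific values below are illustrative only:

```python
# Usage sketch (not part of the commit): enable a fixed-size chunk mask.
# Arguments not listed keep the defaults shown in the diff above.
from west.models.touch_flow.configuration_touch_flow import TouchFlowConfig

config = TouchFlowConfig(
    min_speech_duration=0.2,      # drop training clips shorter than 0.2 s
    max_speech_duration=30,       # drop training clips longer than 30 s
    use_dynamic_chunk=False,      # no random chunk sizes during training
    static_chunk_size=25,         # use a fixed 25-frame chunk instead
    num_decoding_left_chunks=-1,  # each chunk attends to all left chunks
)
```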

west/models/touch_flow/extractor_touch_flow.py

Lines changed: 6 additions & 0 deletions
@@ -21,6 +21,12 @@ def __init__(self, tokenizer, model_config, inference=False):
     def extract(self, item):
         import s3tokenizer
         waveform, sample_rate = torchaudio.load(item['wav'])
+        duration = waveform.size(1) / sample_rate
+        if not self.inference and (
+                duration < self.model_config.min_speech_duration
+                or duration > self.model_config.max_speech_duration):
+            return None
+
         audio = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
         audio_22k = torchaudio.transforms.Resample(sample_rate, 22050)(waveform)
         mel_vocoder = mel_spectrogram(audio_22k,
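The new duration check drops out-of-range utterances at training time only; returning None lets the data pipeline skip the item, while inference is unaffected. A standalone sketch of the same check, using a hypothetical helper name (within_duration) and the default thresholds from this commit:

```python
import torch


def within_duration(waveform: torch.Tensor, sample_rate: int,
                    min_s: float = 0.2, max_s: float = 30.0) -> bool:
    """True if a (channels, samples) waveform is between min_s and max_s seconds."""
    duration = waveform.size(1) / sample_rate
    return min_s <= duration <= max_s


# 1 s of audio at 16 kHz passes; 0.1 s is filtered out, as in extract().
assert within_duration(torch.zeros(1, 16000), 16000)
assert not within_duration(torch.zeros(1, 1600), 16000)
```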

west/models/touch_flow/modeling_touch_flow.py

Lines changed: 26 additions & 4 deletions
@@ -12,7 +12,8 @@
 from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                           PreTrainedModel)
 
-from west.utils.mask import make_pad_mask, non_causal_mask
+from west.utils.mask import (add_optional_chunk_mask, make_pad_mask,
+                             mask_to_bias)
 from west.utils.utils import freeze_module
 
 from .configuration_touch_flow import TouchFlowConfig
@@ -148,7 +149,17 @@ def forward(
                            dim=-1)  # (B, T, 5*M)
         inputs = self.input_projector(inputs)  # (B, T, D)
         mask = ~make_pad_mask(mel_vocoder_lengths).to(device)  # (B, T)
-        att_mask = non_causal_mask(mel_vocoder_lengths).to(device)  # (B, T, T)
+        att_mask = add_optional_chunk_mask(
+            xs=token_cond, masks=mask.unsqueeze(1),
+            use_dynamic_chunk=self.config.use_dynamic_chunk,
+            use_dynamic_left_chunk=self.config.use_dynamic_left_chunk,
+            decoding_chunk_size=self.config.decoding_chunk_size,
+            static_chunk_size=self.config.static_chunk_size,
+            num_decoding_left_chunks=self.config.num_decoding_left_chunks,
+            enable_full_context=self.config.enable_full_context,
+            max_chunk_size=self.config.max_chunk_size)  # (B, T, T)
+        if self.llm.config._attn_implementation == "sdpa":
+            att_mask = mask_to_bias(att_mask, token_cond.dtype)
         att_mask = att_mask.unsqueeze(1).float()  # (B, 1, T, T)
         result = self.llm.model(inputs_embeds=inputs,
                                 attention_mask=att_mask,
@@ -213,7 +224,18 @@ def inference(
         x_in[0:1, :, 3 * M:4 * M] = spk_cond
         x_in[0:1, :, 4 * M:5 * M] = mel_cond
         vocoder_lengths = torch.tensor([T], dtype=torch.long, device=device)
-        att_mask = non_causal_mask(vocoder_lengths).to(device)  # (B, T, T)
+        mask = ~make_pad_mask(vocoder_lengths).to(device)  # (B, T)
+        att_mask = add_optional_chunk_mask(
+            xs=token_cond, masks=mask.unsqueeze(1),
+            use_dynamic_chunk=self.config.use_dynamic_chunk,
+            use_dynamic_left_chunk=self.config.use_dynamic_left_chunk,
+            decoding_chunk_size=self.config.decoding_chunk_size,
+            static_chunk_size=self.config.static_chunk_size,
+            num_decoding_left_chunks=self.config.num_decoding_left_chunks,
+            enable_full_context=self.config.enable_full_context,
+            max_chunk_size=self.config.max_chunk_size)  # (B, T, T)
+        if self.llm.config._attn_implementation == "sdpa":
+            att_mask = mask_to_bias(att_mask, token_cond.dtype)
         att_mask = att_mask.unsqueeze(1).float()  # (B, 1, T, T)
         for step in range(1, len(t_span)):
             x_in[:, :, 0:M] = pt
@@ -239,6 +261,6 @@ def inference(
 
     def init_tokenizer(self):
         tokenizer = AutoTokenizer.from_pretrained(
-            self.config.llm_model_name_or_path)
+            self.config.text_tokenizer_path)
         tokenizer.bos_token = "<|im_start|>"
         return tokenizer
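After this change, forward and inference build the attention mask the same way: a padding mask is combined with an optional chunk mask, and, when the backbone uses the SDPA attention implementation, the boolean mask is converted to an additive bias. A minimal sketch of that pipeline in isolation, with made-up lengths and shapes, assuming the helpers are importable from west.utils.mask:

```python
import torch

# Assumes the helpers landed by this commit are importable from west.utils.mask.
from west.utils.mask import add_optional_chunk_mask, make_pad_mask, mask_to_bias

lengths = torch.tensor([6, 4], dtype=torch.long)   # two sequences, padded to T=6
xs = torch.zeros(2, 6, 8)                          # (B, T, D) dummy features

mask = ~make_pad_mask(lengths)                     # (B, T), True on real frames
att_mask = add_optional_chunk_mask(
    xs=xs, masks=mask.unsqueeze(1),
    use_dynamic_chunk=False,
    use_dynamic_left_chunk=False,
    decoding_chunk_size=0,
    static_chunk_size=2,          # chunk-causal attention with 2-frame chunks
    num_decoding_left_chunks=-1,  # every chunk sees all chunks to its left
    enable_full_context=True,
    max_chunk_size=86)            # (B, T, T) boolean mask

# For an SDPA backbone, turn the boolean mask into an additive bias.
att_bias = mask_to_bias(att_mask, torch.float32).unsqueeze(1)  # (B, 1, T, T)
```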

west/utils/mask.py

Lines changed: 121 additions & 0 deletions
@@ -48,4 +48,125 @@ def non_causal_mask(lengths):
     return mask
 
 
+def subsequent_chunk_mask(
+        size: int,
+        chunk_size: int,
+        num_left_chunks: int = -1,
+        device: torch.device = torch.device("cpu"),
+) -> torch.Tensor:
+    """Create mask for subsequent steps (size, size) with chunk size,
+       this is for streaming encoder
+
+    Args:
+        size (int): size of mask
+        chunk_size (int): size of chunk
+        num_left_chunks (int): number of left chunks
+            <0: use full chunk
+            >=0: use num_left_chunks
+        device (torch.device): "cpu" or "cuda" or torch.Tensor.device
+
+    Returns:
+        torch.Tensor: mask
+
+    Examples:
+        >>> subsequent_chunk_mask(4, 2)
+        [[1, 1, 0, 0],
+         [1, 1, 0, 0],
+         [1, 1, 1, 1],
+         [1, 1, 1, 1]]
+    """
+    ret = torch.zeros(size, size, device=device, dtype=torch.bool)
+    for i in range(size):
+        if num_left_chunks < 0:
+            start = 0
+        else:
+            start = max((i // chunk_size - num_left_chunks) * chunk_size, 0)
+        ending = min((i // chunk_size + 1) * chunk_size, size)
+        ret[i, start:ending] = True
+    return ret
+
+
+def mask_to_bias(mask: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
+    assert mask.dtype == torch.bool
+    assert dtype in [torch.float32, torch.bfloat16, torch.float16]
+    mask = mask.to(dtype)
+    # attention mask bias
+    # NOTE(Mddct): torch.finfo jit issues
+    # chunk_masks = (1.0 - chunk_masks) * torch.finfo(dtype).min
+    mask = (1.0 - mask) * -1.0e+10
+    return mask
+
+
+def add_optional_chunk_mask(xs: torch.Tensor,
+                            masks: torch.Tensor,
+                            use_dynamic_chunk: bool,
+                            use_dynamic_left_chunk: bool,
+                            decoding_chunk_size: int,
+                            static_chunk_size: int,
+                            num_decoding_left_chunks: int,
+                            enable_full_context: bool = True,
+                            max_chunk_size: int = 25):
+    """ Apply optional mask for encoder.
+
+    Args:
+        xs (torch.Tensor): padded input, (B, L, D), L for max length
+        mask (torch.Tensor): mask for xs, (B, 1, L)
+        use_dynamic_chunk (bool): whether to use dynamic chunk or not
+        use_dynamic_left_chunk (bool): whether to use dynamic left chunk for
+            training.
+        decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's
+            0: default for training, use random dynamic chunk.
+            <0: for decoding, use full chunk.
+            >0: for decoding, use fixed chunk size as set.
+        static_chunk_size (int): chunk size for static chunk training/decoding
+            if it's greater than 0, if use_dynamic_chunk is true,
+            this parameter will be ignored
+        num_decoding_left_chunks: number of left chunks, this is for decoding,
+            the chunk size is decoding_chunk_size.
+            >=0: use num_decoding_left_chunks
+            <0: use all left chunks
+        enable_full_context (bool):
+            True: chunk size is [1, max_chunk_size] or full context(max_len)
+            False: chunk size ~ U[1, max_chunk_size]
+
+    Returns:
+        torch.Tensor: chunk mask of the input xs.
+    """
+    # Whether to use chunk mask or not
+    if use_dynamic_chunk:
+        max_len = xs.size(1)
+        if decoding_chunk_size < 0:
+            chunk_size = max_len
+            num_left_chunks = -1
+        elif decoding_chunk_size > 0:
+            chunk_size = decoding_chunk_size
+            num_left_chunks = num_decoding_left_chunks
+        else:
+            # chunk_size maybe [1, max_chunk_size] or max_len if full context.
+            chunk_size = torch.randint(1, max_len, (1, )).item()
+            num_left_chunks = -1
+            if chunk_size > max_len // 2 and enable_full_context:
+                chunk_size = max_len
+            else:
+                chunk_size = chunk_size % max_chunk_size + 1
+                if use_dynamic_left_chunk:
+                    max_left_chunks = (max_len - 1) // chunk_size
+                    num_left_chunks = torch.randint(0, max_left_chunks,
+                                                    (1, )).item()
+        chunk_masks = subsequent_chunk_mask(xs.size(1), chunk_size,
+                                            num_left_chunks,
+                                            xs.device)  # (L, L)
+        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
+    elif static_chunk_size > 0:
+        num_left_chunks = num_decoding_left_chunks
+        chunk_masks = subsequent_chunk_mask(xs.size(1), static_chunk_size,
+                                            num_left_chunks,
+                                            xs.device)  # (L, L)
+        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
+        chunk_masks = masks & chunk_masks  # (B, L, L)
+    else:
+        chunk_masks = masks
+    return chunk_masks
+
 # print(non_causal_mask(torch.tensor([2, 3, 4], dtype=torch.long)))
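To make the chunk and left-chunk semantics concrete, a quick check of subsequent_chunk_mask (assuming west.utils.mask is importable): the first call reproduces the docstring example; the second keeps only one left chunk, so later frames stop attending to chunk 0.

```python
import torch

from west.utils.mask import subsequent_chunk_mask  # assumes west is on PYTHONPATH

# Full left context (num_left_chunks < 0): matches the docstring example.
print(subsequent_chunk_mask(4, 2).int())
# tensor([[1, 1, 0, 0],
#         [1, 1, 0, 0],
#         [1, 1, 1, 1],
#         [1, 1, 1, 1]])

# Only one left chunk kept: frames in the last chunk no longer see chunk 0.
print(subsequent_chunk_mask(6, 2, num_left_chunks=1).int())
# tensor([[1, 1, 0, 0, 0, 0],
#         [1, 1, 0, 0, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [1, 1, 1, 1, 0, 0],
#         [0, 0, 1, 1, 1, 1],
#         [0, 0, 1, 1, 1, 1]])
```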
