Commit 24a19c5
Author: khanld
[feat] add masked batch and limited context decoding
1 parent: 4739a05

File tree: 7 files changed, +114 -45 lines

wenet/bin/recognize.py

Lines changed: 1 addition & 1 deletion
@@ -281,7 +281,7 @@ def main():
     with torch.no_grad():
         for batch_idx, batch in enumerate(test_data_loader):
             keys = batch["keys"]
-            feats = batch["feats"].to(device)
+            feats = batch["feats"].to(device) if type(batch["feats"]) is torch.Tensor else batch["feats"]
             target = batch["target"].to(device)
             feats_lengths = batch["feats_lengths"].to(device)
             target_lengths = batch["target_lengths"].to(device)
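Why this guard is needed: when the new pad_feat option (added in wenet/dataset/dataset.py below) is turned off, batch["feats"] arrives as a Python list of per-utterance tensors rather than one padded tensor, and a list has no .to(device). A minimal sketch of the same dispatch (using isinstance rather than the commit's `type(...) is` check; shapes are illustrative):

import torch

def move_feats(feats, device):
    # tensors are moved wholesale; lists are passed through untouched,
    # since the chunked encoder moves the data to the device itself
    if isinstance(feats, torch.Tensor):
        return feats.to(device)  # padded batch: (B, T, D)
    return feats                 # masked batch: list of (T_i, D) tensors

print(move_feats(torch.randn(2, 100, 80), "cpu").shape)  # torch.Size([2, 100, 80])
print(len(move_feats([torch.randn(80, 80), torch.randn(100, 80)], "cpu")))  # 2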

wenet/chunkformer/attention.py

Lines changed: 18 additions & 12 deletions
@@ -103,10 +103,12 @@ def forward(self, query: torch.Tensor,
             k = torch.cat([key_cache, k], dim=2)
             v = torch.cat([value_cache, v], dim=2)
 
-        # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
-        # non-trivial to calculate `next_cache_start` here.
-
-        new_cache = torch.cat((k, v), dim=-1)
+            # NOTE(xcsong): We do cache slicing in encoder.forward_chunk, since it's
+            # non-trivial to calculate `next_cache_start` here.
+            new_cache = torch.cat((k, v), dim=-1)
+        else:
+            # streaming long-form transcription is disabled if the input cache is empty; only long-form transcription and masked batch are supported
+            new_cache = cache
 
         n_batch_pos = pos_emb.size(0)
         p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)

@@ -139,7 +141,7 @@ def forward_parallel_chunk(self, query: torch.Tensor,
                            key: torch.Tensor, value: torch.Tensor,
                            mask: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
                            pos_emb: torch.Tensor = torch.empty(0),
-                           cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
+                           cache: torch.Tensor = torch.zeros((0, 0, 0)),
                            right_context_size: int = 0,
                            left_context_size: int = 0,
                            truncated_context_size: int = 0

@@ -153,20 +155,20 @@ def forward_parallel_chunk(self, query: torch.Tensor,
                 (#batch, time1, time2), (0, 0, 0) means fake mask.
             pos_emb (torch.Tensor): Positional embedding tensor
                 (#batch, time2, size).
-            cache (torch.Tensor): Cache tensor (B, 1, head, cache_t, d_k * 2),
-                where `cache_t == chunk_size * num_decoding_left_chunks`
+            cache (torch.Tensor): Cache tensor (cache_t, head, d_k * 2),
+                where `cache_t == left_context_size`
                 and `head * d_k == size`
         Returns:
             torch.Tensor: Output tensor (#batch, time1, d_model).
-            torch.Tensor: Cache tensor (1, head, cache_t + time1, d_k * 2)
-                where `cache_t == chunk_size * num_decoding_left_chunks`
+            torch.Tensor: Cache tensor (cache_t, head, d_k * 2)
+                where `cache_t == left_context_size`
                 and `head * d_k == size`
         """
         q, k, v = self.forward_qkv(query, key, value)
 
         q = q.transpose(1, 2)  # (batch, time1, head, d_k)
-
-        if cache.size(2) <= 0:
+        cache_t = cache.size(0)
+        if cache_t == 0:
             cache = torch.zeros((left_context_size, self.h, self.d_k * 2), device=q.device, dtype=q.dtype)
 
         kv = torch.cat([k, v], dim=-1)  # (B, head, time1, d_k * 2)

@@ -175,7 +177,11 @@ def forward_parallel_chunk(self, query: torch.Tensor,
 
         #----------Overlapping Chunk Transformation-----------------------------------
         kv = torch.cat([cache, kv], dim=0)
-        new_cache = kv[:truncated_context_size + cache.size(0)][-cache.size(0):].cpu()
+
+        if cache_t > 0:
+            new_cache = kv[:truncated_context_size + cache.size(0)][-cache.size(0):]
+        else:
+            new_cache = torch.zeros((0, 0, 0), device=q.device, dtype=q.dtype)
         kv = torch.nn.functional.pad(kv, (0, 0, 0, 0, 0, right_context_size))
         kv = kv.unfold(0, left_context_size + q.shape[1] + right_context_size, q.shape[1])
         #-----------------------------------------------------------------------------
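For orientation, a toy sketch (invented sizes, not the repo's tensors) of the Overlapping Chunk Transformation the new cache logic feeds: prepend the left-context cache, zero-pad the right context, then let unfold cut the flat time axis into overlapping windows, one per chunk:

import torch

chunk, left, right = 4, 2, 2
kv = torch.arange(12, dtype=torch.float32).unsqueeze(-1)  # (time, feat): 3 chunks of 4
cache = torch.zeros(left, 1)                              # fake (empty-valued) left-context cache
kv = torch.cat([cache, kv], dim=0)                        # (left + time, feat)
kv = torch.nn.functional.pad(kv, (0, 0, 0, right))        # zero right context
windows = kv.unfold(0, left + chunk + right, chunk)       # (n_chunks, feat, window)
print(windows.shape)  # torch.Size([3, 1, 8]): each chunk sees left and right context

With a real (non-empty) cache, the slice kv[:truncated_context_size + cache.size(0)][-cache.size(0):] above keeps exactly the frames the next long-form call will need as left context.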

wenet/chunkformer/convolution.py

Lines changed: 16 additions & 10 deletions
@@ -185,26 +185,27 @@ def forward_parallel_chunk(
         self,
         x: torch.Tensor,
         mask_pad: torch.Tensor = torch.ones((0, 0, 0), dtype=torch.bool),
-        cache: torch.Tensor = torch.zeros((0, 0, 0)),
+        cache: torch.Tensor = torch.zeros((0, 0)),
         truncated_context_size: int = 0
 
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Compute convolution module.
         Args:
-            x (torch.Tensor): Input tensor (#batch, time, channels).
+            x (torch.Tensor): Input tensor (time, channels).
             mask_pad (torch.Tensor): used for batch padding (#batch, 1, time),
                 (0, 0, 0) means fake mask.
             cache (torch.Tensor): left context cache, it is only
-                used in causal convolution (#batch, channels, cache_t),
-                (0, 0, 0) means fake cache.
+                used in causal convolution (channels, cache_t),
+                (0, 0) means fake cache.
         Returns:
-            torch.Tensor: Output tensor (#batch, time, channels).
+            torch.Tensor: Output tensor (time, channels).
         """
         # exchange the temporal dimension and the feature dimension
         x = x.transpose(1, 2)  # (#batch, channels, time)
         lorder = self.kernel_size // 2
         chunk_size = x.shape[-1]
-        if cache.size(0) == 0:
+        cache_t = cache.size(-1)
+        if cache_t == 0:
             cache = torch.zeros(self.channels, lorder).to(x.device)
         # GLU mechanism
         x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)

@@ -213,9 +214,15 @@ def forward_parallel_chunk(
         #----------Overlapping Chunk Transformation-----------------------------------
         x = x.transpose(0, 1).reshape(self.channels, -1)  # [C, n_chunk * T]
         x = torch.cat([cache, x], dim=-1)
-        new_cache = x[:, :truncated_context_size + cache.size(-1)][:, -cache.size(-1):].cpu()
+
+        # streaming long-form transcription is disabled if the input cache is empty; only long-form transcription and masked batch are supported
+        if cache_t > 0:
+            new_cache = x[:, :truncated_context_size + cache.size(-1)][:, -cache.size(-1):]
+        else:
+            new_cache = torch.zeros((0, 0))
+
         x = nn.functional.pad(x, (0, lorder), 'constant', 0.0)
-        x = x.unfold(-1, chunk_size + 2 * lorder, chunk_size).transpose(0, 1)  # [n_chunk + 1, C, cnn_cache_size]
+        x = x.unfold(-1, chunk_size + 2 * lorder, chunk_size).transpose(0, 1)  # [n_chunk + 1, C, chunk_size + 2 * lorder]
         #-----------------------------------------------------------------------------
 
         if mask_pad.size(2) > 0:  # time > 0

@@ -232,7 +239,6 @@ def forward_parallel_chunk(
         x = self.pointwise_conv2(x)
         # mask batch padding
         if mask_pad.size(2) > 0:  # time > 0
-            # x.masked_fill_(~mask_pad[:, :, self.lorder:], 0.0)
-            x.masked_fill_(~mask_pad[:, :, self.lorder:-self.lorder], 0.0)
+            x.masked_fill_(~mask_pad[:, :, lorder:-lorder], 0.0)
 
         return x.transpose(1, 2), new_cache
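A hedged sketch of the cache bookkeeping above (sizes invented): with a non-empty left-context cache, i.e. streaming long-form decoding, the next cache is the trailing cache_t columns of the truncated context; with an empty cache, i.e. masked batch decoding, no state is carried between calls. Dropping the old .cpu() call also keeps the cache on the compute device.

import torch

channels, cache_t, truncated_context_size = 4, 3, 8
x = torch.randn(channels, 20)           # (C, n_chunk * T) after the reshape
cache = torch.randn(channels, cache_t)  # non-empty left-context conv cache

x = torch.cat([cache, x], dim=-1)       # (C, cache_t + n_chunk * T)
if cache_t > 0:
    # trailing cache_t columns of the truncated context become the next cache
    new_cache = x[:, :truncated_context_size + cache_t][:, -cache_t:]
else:
    new_cache = torch.zeros((0, 0))     # empty cache in, no state carried out
print(new_cache.shape)                  # torch.Size([4, 3])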

wenet/chunkformer/encoder.py

Lines changed: 71 additions & 16 deletions
@@ -177,22 +177,62 @@ def __init__(
             ) for _ in range(num_blocks)
         ])
 
+    def forward(self,
+                xs: torch.Tensor,
+                xs_lens: torch.Tensor,
+                decoding_chunk_size: int = 0,
+                num_decoding_left_chunks: int = -1,
+                **kwargs):
+        """
+        Main forward function that dispatches to either the standard
+        forward pass or the parallel chunk version, based on the
+        model's training mode.
+        """
+        if self.training:
+            return super().forward(
+                xs=xs,
+                xs_lens=xs_lens,
+                decoding_chunk_size=decoding_chunk_size,
+                num_decoding_left_chunks=num_decoding_left_chunks,
+                **kwargs
+            )  # call the parent class's forward method
+        else:
+            if decoding_chunk_size > 0 and num_decoding_left_chunks > 0:
+                # if both decoding_chunk_size and num_decoding_left_chunks
+                # are set, use the parallel chunk version
+                return self.forward_parallel_chunk(
+                    xs=xs,
+                    xs_origin_lens=xs_lens,
+                    chunk_size=decoding_chunk_size,
+                    left_context_size=num_decoding_left_chunks,
+                    right_context_size=num_decoding_left_chunks,  # we assume left and right context sizes are the same
+                )
+            else:
+                # otherwise, use the standard forward pass
+                return super().forward(
+                    xs=xs,
+                    xs_lens=xs_lens,
+                    decoding_chunk_size=decoding_chunk_size,
+                    num_decoding_left_chunks=num_decoding_left_chunks,
+                    **kwargs
+                )
+
     def forward_parallel_chunk(
         self,
         xs,
         xs_origin_lens,
         chunk_size: int = -1,
         left_context_size: int = -1,
         right_context_size: int = -1,
-        att_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
-        cnn_cache: torch.Tensor = torch.zeros((0, 0, 0, 0)),
+        att_cache: torch.Tensor = torch.zeros((0, 0, 0)),
+        cnn_cache: torch.Tensor = torch.zeros((0, 0)),
         truncated_context_size: int = 0,
         offset: torch.Tensor = torch.zeros(0),
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Embed positions in tensor.
 
         Args:
-            xs: padded input tensor (B, T, D)
+            xs: list of B input tensors (T, D)
             xs_lens: input length (B)
             decoding_chunk_size: decoding chunk size for dynamic chunk
                 0: default for training, use random dynamic chunk.

@@ -208,10 +248,11 @@ def forward_parallel_chunk(
             masks: torch.Tensor batch padding mask after subsample
                 (B, 1, T' ~= T/subsample_rate)
         """
-        assert offset.shape[0] == len(xs), f"{offset.shape[0]} - {len(xs)}"
+        if offset.shape[0] == 0:
+            offset = torch.zeros(len(xs), dtype=torch.long, device=xs_origin_lens.device)
 
         # --------------------------Chunk Batching-------------------------------------------
-        subsampling = self.embed.subsampling_factor
+        subsampling = self.embed.subsampling_rate
         context = self.embed.right_context + 1  # Add current frame
         size = (chunk_size - 1) * subsampling + context
         step = subsampling * chunk_size

@@ -258,6 +299,7 @@ def forward_parallel_chunk(
 
         xs = torch.cat(x_pad, dim=0).to(device)
         xs_lens = torch.tensor(xs_lens).to(device)
+        masks = ~make_pad_mask(xs_lens, xs.size(1)).unsqueeze(1)  # (B, 1, T)
         upper_bounds = torch.cat(upper_bounds).unsqueeze(1).to(device)
         lower_bounds = torch.cat(lower_bounds).unsqueeze(1).to(device)
         upper_bounds_conv = torch.cat(upper_bounds_conv).unsqueeze(1).to(device)

@@ -269,9 +311,7 @@ def forward_parallel_chunk(
             xs = self.global_cmvn(xs)
 
 
-        xs, pos_emb, xs_lens = self.embed(xs, xs_lens, offset=left_context_size, right_context_size=right_context_size)
-        masks = ~make_pad_mask(xs_lens, xs.size(1)).unsqueeze(1)  # (B, 1, T)
-
+        xs, pos_emb, masks = self.embed(xs, masks, offset=left_context_size, right_context_size=right_context_size)
 
         mask_pad = torch.arange(0, conv_lorder + chunk_size + conv_lorder, device=masks.device).unsqueeze(0).repeat(xs.size(0), 1)  # [B, left_context_size + chunk_size]
         mask_pad = (lower_bounds_conv <= mask_pad) & (mask_pad < upper_bounds_conv)

@@ -280,7 +320,6 @@ def forward_parallel_chunk(
         att_mask = (lower_bounds <= att_mask) & (att_mask < upper_bounds)
         att_mask = att_mask.flip(-1).unsqueeze(1)
 
-
         r_att_cache = []
         r_cnn_cache = []
         for i, layer in enumerate(self.encoders):

@@ -296,19 +335,35 @@ def forward_parallel_chunk(
             r_att_cache.append(new_att_cache)
             r_cnn_cache.append(new_cnn_cache)
 
-        del att_cache
-        del cnn_cache
         if self.normalize_before:
             xs = self.after_norm(xs)
 
-        xs_lens = self.embed.calc_length(xs_origin_lens)
-        offset += xs_lens
-
 
         # NOTE(xcsong): shape(r_att_cache) is (elayers, head, ?, d_k * 2),
         #   ? may be larger than cache_t1, it depends on required_cache_size
         r_att_cache = torch.stack(r_att_cache, dim=0)
         # NOTE(xcsong): shape(r_cnn_cache) is (e, b=1, hidden-dim, cache_t2)
         r_cnn_cache = torch.stack(r_cnn_cache, dim=0)
-        return xs, xs_lens, n_chunks, r_att_cache, r_cnn_cache, offset
-
+
+        # there would be no need to reconstruct (pad) in greedy search,
+        # but we reconstruct here for compatibility with WeNet
+        xs_lens = self.embed.calc_length(xs_origin_lens)
+        xs, masks = self.reconstruct(xs, xs_lens, n_chunks)
+        offset += xs_lens
+
+        return xs, masks
+
+    def reconstruct(
+        self,
+        xs,
+        xs_lens,
+        n_chunks
+    ):
+        xs = xs.split(n_chunks, dim=0)
+        xs = [x.reshape(-1, self._output_size)[:x_len] for x, x_len in zip(xs, xs_lens)]
+
+        xs = torch.nn.utils.rnn.pad_sequence(xs,
+                                             batch_first=True,
+                                             padding_value=0)
+        masks = ~make_pad_mask(xs_lens, xs.size(1)).unsqueeze(1).to(xs.device)  # (B, 1, T)
+        return xs, masks
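A standalone sketch of what reconstruct() does (toy sizes; self._output_size and make_pad_mask are replaced by plain stand-ins): the encoder batches all chunks of all utterances together as (sum(n_chunks), chunk_size, D), so the output is split back per utterance, un-chunked along time, trimmed to the true length, and re-padded into the WeNet-style (B, T, D) layout:

import torch
from torch.nn.utils.rnn import pad_sequence

D = 8
n_chunks = [3, 2]                      # chunks per utterance
xs_lens = torch.tensor([10, 7])        # true frame counts after subsampling
xs = torch.randn(sum(n_chunks), 4, D)  # (total_chunks, chunk_size=4, D)

per_utt = xs.split(n_chunks, dim=0)    # [(3, 4, D), (2, 4, D)]
per_utt = [x.reshape(-1, D)[:l] for x, l in zip(per_utt, xs_lens)]
out = pad_sequence(per_utt, batch_first=True, padding_value=0)
print(out.shape)  # torch.Size([2, 10, 8])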

wenet/dataset/dataset.py

Lines changed: 5 additions & 3 deletions
@@ -132,24 +132,26 @@ def Dataset(data_type,
 
     batch_conf = conf.get('batch_conf', {})
     batch_type = batch_conf.get('batch_type', 'static')
+    pad_feat = batch_conf.get('pad_feat', 'True')
+
     assert batch_type in ['static', 'bucket', 'dynamic']
     if batch_type == 'static':
         assert 'batch_size' in batch_conf
         batch_size = batch_conf.get('batch_size', 16)
-        dataset = dataset.batch(batch_size, wrapper_class=processor.padding)
+        dataset = dataset.batch(batch_size, wrapper_class=lambda batch: processor.padding(batch, pad_feat))
     elif batch_type == 'bucket':
         assert 'bucket_boundaries' in batch_conf
         assert 'bucket_batch_sizes' in batch_conf
         dataset = dataset.bucket_by_sequence_length(
             processor.feats_length_fn,
             batch_conf['bucket_boundaries'],
             batch_conf['bucket_batch_sizes'],
-            wrapper_class=processor.padding)
+            wrapper_class=lambda batch: processor.padding(batch, pad_feat))
     else:
         max_frames_in_batch = batch_conf.get('max_frames_in_batch', 12000)
         dataset = dataset.dynamic_batch(
             processor.DynamicBatchWindow(max_frames_in_batch),
-            wrapper_class=processor.padding,
+            wrapper_class=lambda batch: processor.padding(batch, pad_feat)
         )
 
     return dataset
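How the flag is presumably driven from a config (the YAML fragment in the comment is an assumption for illustration, not part of this commit). Note the committed default is the string 'True', which is truthy, so padding stays on unless the config overrides it; the lambda freezes pad_feat at dataset-construction time, so every batch wrapper sees the same value:

# assumed YAML, illustrative only:
#   dataset_conf:
#     batch_conf:
#       batch_type: 'static'
#       batch_size: 16
#       pad_feat: false    # yield "feats" as a list for masked batch decoding

batch_conf = {'batch_type': 'static', 'batch_size': 16, 'pad_feat': False}
pad_feat = batch_conf.get('pad_feat', 'True')  # committed default: truthy string 'True'

def padding(batch, pad_feat=True):  # stand-in for processor.padding
    return {'pad_feat': bool(pad_feat), 'batch_size': len(batch)}

wrapper = lambda batch: padding(batch, pad_feat)
print(wrapper([0, 1, 2]))  # {'pad_feat': False, 'batch_size': 3}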

wenet/dataset/processor.py

Lines changed: 2 additions & 2 deletions
@@ -527,7 +527,7 @@ def spec_trim(sample, max_t=20):
     return sample
 
 
-def padding(data):
+def padding(data, pad_feat=True):
     """ Padding the data into training data
 
     Args:

@@ -565,7 +565,7 @@ def padding(data):
 
     batch = {
         "keys": sorted_keys,
-        "feats": padded_feats,
+        "feats": padded_feats if pad_feat else sorted_feats,
         "target": padding_labels,
         "feats_lengths": feats_lengths,
         "target_lengths": label_lengths,

wenet/transformer/asr_model.py

Lines changed: 1 addition & 1 deletion
@@ -301,7 +301,7 @@ def decode(
 
         Returns: dict results of all decoding methods
         """
-        assert speech.shape[0] == speech_lengths.shape[0]
+        assert len(speech) == len(speech_lengths)
         assert decoding_chunk_size != 0
         encoder_out, encoder_mask = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
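Why the looser assert still checks the same invariant (a sketch, not repo code): len() yields the batch size for both a padded (B, T, D) tensor and a list of B tensors, whereas .shape exists only on the former:

import torch

speech_lengths = torch.tensor([80, 100])
speech_tensor = torch.randn(2, 100, 80)                    # padded batch
speech_list = [torch.randn(80, 80), torch.randn(100, 80)]  # masked batch

assert len(speech_tensor) == len(speech_lengths)  # size of dim 0 -> 2
assert len(speech_list) == len(speech_lengths)    # list length -> 2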
