 
 import torch
 from torch import nn
-from wenet.utils.class_utils import WENET_NORM_CLASSES
-
 
 class ChunkConvolutionModule(nn.Module):
     """ConvolutionModule in ChunkFormer model."""
@@ -58,11 +56,13 @@ def __init__(self,
             padding = 0
             self.lorder = kernel_size - 1
         elif dynamic_conv:
+            # kernel_size should be an odd number for non-causal convolution
             assert (kernel_size - 1) % 2 == 0
             padding = 0
-            self.lorder = (kernel_size - 1)// 2
+            self.lorder = (kernel_size - 1) // 2
         else:
             # kernel_size should be an odd number for non-causal convolution
+            assert (kernel_size - 1) % 2 == 0
             padding = (kernel_size - 1) // 2
             self.lorder = 0
         self.depthwise_conv = nn.Conv1d(
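Side note on the padding arithmetic in these branches: the causal branch left-pads by `kernel_size - 1` at runtime, the dynamic branch keeps `padding = 0` and instead carries `lorder = (kernel_size - 1) // 2` context frames per chunk, and the default branch pads symmetrically inside the conv. A minimal sketch (illustrative sizes, not code from this PR) showing that each choice keeps the output length equal to the input length:

```python
import torch
from torch import nn

kernel_size = 15
x = torch.randn(1, 8, 100)  # (batch, channels, time)

# causal: left-pad by kernel_size - 1 outside the conv, padding=0 inside
causal = nn.Conv1d(8, 8, kernel_size, padding=0, groups=8)
y = causal(nn.functional.pad(x, (kernel_size - 1, 0)))
assert y.size(2) == x.size(2)

# non-causal: symmetric padding of (kernel_size - 1) // 2 inside the conv
sym = nn.Conv1d(8, 8, kernel_size, padding=(kernel_size - 1) // 2, groups=8)
assert sym(x).size(2) == x.size(2)
```

The `assert` added in the `else` branch makes the odd-kernel requirement explicit for the symmetric case too, instead of silently producing an off-by-one output length.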
@@ -92,7 +92,7 @@ def __init__(self,
             bias=bias,
         )
         self.activation = activation
-
+
     def forward(
         self,
         x: torch.Tensor,
@@ -145,13 +145,17 @@ def forward(
             size = self.lorder + decoding_chunk_size
             step = decoding_chunk_size
 
-            n_frames_pad = (step - ((x.size(2) - size) % step)) % step
-            x = torch.nn.functional.pad(x, (0, n_frames_pad))  # (batch, 2*channel, dim + n_frames_pad)
+            n_frames_pad = (step - ((x.size(2) - size) % step)) % step
+            # (batch, 2*channel, dim + n_frames_pad)
+            x = torch.nn.functional.pad(x, (0, n_frames_pad))
 
             n_chunks = ((x.size(2) - size) // step) + 1
-            x = x.unfold(-1, size=size, step=step)  # [B, C, n_chunks, size]
-            x = x.transpose(1, 2)  # [B, n_chunks, C, size]
-            x = x.reshape(-1, x.size(2), x.size(3))  # [B * n_chunks, C, size]
+            # [B, C, n_chunks, size]
+            x = x.unfold(-1, size=size, step=step)
+            # [B, n_chunks, C, size]
+            x = x.transpose(1, 2)
+            # [B * n_chunks, C, size]
+            x = x.reshape(-1, x.size(2), x.size(3))
 
             # pad right for dynamic conv
             x = nn.functional.pad(x, (0, self.lorder), 'constant', 0.0)
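The `n_frames_pad` formula rounds the time axis up to the next whole window: it is the smallest right-padding that makes `(T - size)` divisible by `step`, so `unfold` covers every frame. A small sketch with assumed sizes (not from this diff) checking the resulting chunk count:

```python
import torch

lorder, decoding_chunk_size = 7, 16
size, step = lorder + decoding_chunk_size, decoding_chunk_size

x = torch.randn(2, 8, 100)  # (batch, channels, time)
n_frames_pad = (step - ((x.size(2) - size) % step)) % step  # -> 3
x = torch.nn.functional.pad(x, (0, n_frames_pad))           # time: 103

n_chunks = ((x.size(2) - size) // step) + 1                 # -> 6
x = x.unfold(-1, size=size, step=step)                      # [B, C, n_chunks, size]
assert x.shape == (2, 8, n_chunks, size)
```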
@@ -161,19 +165,22 @@ def forward(
         x = self.depthwise_conv(x)
 
         if self.dynamic_conv:
-            # x size: [B * n_chunk, C, decoding_chunk_size]
-            x = x.reshape(-1, n_chunks, x.size(1), x.size(2))  # [B, n_chunk, C, decoding_chunk_size]
-            x = x.transpose(1, 2)  # [B, C, n_chunks, decoding_chunk_size]
-            x = x.reshape(x.size(0), x.size(1), -1)  # [B, C, n_chunks * decoding_chunk_size]
-            x = x[..., :x.size(2) - n_frames_pad]  # (batch, channel, dim)
+            # [B, n_chunk, C, decoding_chunk_size]
+            x = x.reshape(-1, n_chunks, x.size(1), x.size(2))
+            # [B, C, n_chunks, decoding_chunk_size]
+            x = x.transpose(1, 2)
+            # [B, C, n_chunks * decoding_chunk_size]
+            x = x.reshape(x.size(0), x.size(1), -1)
+            # remove padding
+            x = x[..., :x.size(2) - n_frames_pad]
 
         if self.use_layer_norm:
             x = x.transpose(1, 2)
         x = self.activation(self.norm(x))
         if self.use_layer_norm:
             x = x.transpose(1, 2)
         x = self.pointwise_conv2(x)
-
+
         # mask batch padding
         if mask_pad.size(2) > 0:  # time > 0
             x.masked_fill_(~mask_pad.to(torch.bool), 0.0)
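After the depthwise conv consumes the `lorder` frames of right context, each chunk is back to `decoding_chunk_size` frames, and this reshape sequence is the exact inverse of the earlier split; the final slice drops the `n_frames_pad` frames added before unfolding. A shape-only sketch under assumed sizes (none of these values come from the PR):

```python
import torch

B, C, n_chunks, T_chunk, n_frames_pad = 2, 8, 6, 16, 3
x = torch.randn(B * n_chunks, C, T_chunk)  # output of the depthwise conv

x = x.reshape(B, n_chunks, C, T_chunk)   # [B, n_chunks, C, T_chunk]
x = x.transpose(1, 2)                    # [B, C, n_chunks, T_chunk]
x = x.reshape(B, C, -1)                  # [B, C, n_chunks * T_chunk]
x = x[..., :x.size(2) - n_frames_pad]    # drop the right padding
assert x.shape == (B, C, n_chunks * T_chunk - n_frames_pad)
```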
@@ -202,7 +209,7 @@ def forward_parallel_chunk(
         """
         # exchange the temporal dimension and the feature dimension
         x = x.transpose(1, 2)  # (#batch, channels, time)
-        lorder = self.kernel_size // 2
+        lorder = self.kernel_size // 2
         chunk_size = x.shape[-1]
         cache_t = cache.size(-1)
         if cache_t == 0:
@@ -211,19 +218,21 @@ def forward_parallel_chunk(
         x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
         x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)
 
-        #----------Overlapping Chunk Transformation-----------------------------------
-        x = x.transpose(0, 1).reshape( self.channels, -1)  # [C, n_chunk * T]
+        # ----------Overlapping Chunk Transformation----------------------------------
+        x = x.transpose(0, 1).reshape(self.channels, -1)  # [C, n_chunk * T]
         x = torch.cat([cache, x], dim=-1)
 
-        # streaming long-form transcription is disabled if input cache is empty, only support long-form transcription and masked batch
+        # Streaming long-form transcription is disabled if the input cache is empty
         if cache_t > 0:
-            new_cache = x[:, :truncated_context_size + cache.size(-1)][:, -cache.size(-1):]
+            new_cache = x[:, :truncated_context_size + cache.size(-1)]
+            new_cache = new_cache[:, -cache.size(-1):]
         else:
             new_cache = torch.zeros((0, 0))
 
         x = nn.functional.pad(x, (0, lorder), 'constant', 0.0)
-        x = x.unfold(-1, chunk_size + 2 * lorder, chunk_size).transpose(0, 1)  # [n_chunk +1, C, chunk_size + 2 * lorder]
-        #-----------------------------------------------------------------------------
+        x = x.unfold(-1, chunk_size + 2 * lorder, chunk_size).transpose(0, 1)
+        # [n_chunk + 1, C, chunk_size + 2 * lorder]
+        # ----------------------------------------------------------------------------
 
         if mask_pad.size(2) > 0:  # time > 0
             x = torch.where(mask_pad, x, 0)
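For context on the overlapping chunk transformation: the batch of chunks is flattened into one long sequence per channel, prefixed with the cache, right-padded by `lorder`, and re-windowed so that every window carries `2 * lorder` extra frames of context. A sketch with an assumed cache length of `chunk_size + lorder` frames, chosen purely so the window count works out to `n_chunk + 1` as in the shape comment above; the real cache size depends on the model configuration:

```python
import torch

C, chunk_size, lorder, n_chunk = 8, 16, 7, 4
cache = torch.zeros(C, chunk_size + lorder)  # assumed cache length
x = torch.randn(C, n_chunk * chunk_size)     # [C, n_chunk * T]
x = torch.cat([cache, x], dim=-1)

x = torch.nn.functional.pad(x, (0, lorder), 'constant', 0.0)
x = x.unfold(-1, chunk_size + 2 * lorder, chunk_size).transpose(0, 1)
assert x.shape == (n_chunk + 1, C, chunk_size + 2 * lorder)
```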