add missing files

Mddct · Mddct · commit 82c34c4b0ef8 · 2025-02-07T14:24:03.000+08:00
diff --git a/wenet/firered/attention.py b/wenet/firered/attention.py
@@ -49,7 +49,7 @@ def position_encoding(self,
 
         raise NotImplementedError('firedasr not support streaming pos encding')
 
-    def forward(self, x):
+    def forward(self, x, offset=None):
         Tmax, T = self.pe.size(1), x.size(1)
         pos_emb = self.pe[:, Tmax // 2 - T + 1:Tmax // 2 + T].clone().detach()
         return self.dropout(x), self.dropout(pos_emb)
@@ -99,7 +99,7 @@ def rel_shift(self, x):
                                  x.size()[1],
                                  x.size(3) + 1, x.size(2))
         x = x_padded[:, :, 1:].view_as(x)
-        x = x[:, :, :, :, x.size(-1) // 2 + 1]
+        x = x[:, :, :, :x.size(-1) // 2 + 1]
 
         return x
 
diff --git a/wenet/firered/convert_FireRed_AED_L_to_wenet_config_and_ckpt.py b/wenet/firered/convert_FireRed_AED_L_to_wenet_config_and_ckpt.py
@@ -92,7 +92,7 @@ def convert_to_wenet_yaml(tokenizer: BaseTokenizer, dims, wenet_yaml_path: str,
     configs['ctc_conf'] = {}
     configs['ctc_conf']['ctc_blank_id'] = 0
 
-    configs['cmvn'] = None
+    configs['cmvn'] = 'global_cmvn'
     configs['cmvn_conf'] = {}
     configs['cmvn_conf']['cmvn_file'] = json_cmvn_path
     configs['cmvn_conf']['is_json_cmvn'] = True
diff --git a/wenet/firered/encoder.py b/wenet/firered/encoder.py
@@ -0,0 +1,127 @@
+from typing import Optional
+
+import torch
+from wenet.firered.encoder_layer import FireRedConformerEncoderLayer
+from wenet.transformer.convolution import ConvolutionModule
+from wenet.transformer.encoder import BaseEncoder
+from wenet.utils.class_utils import (WENET_ACTIVATION_CLASSES,
+                                     WENET_ATTENTION_CLASSES,
+                                     WENET_MLP_CLASSES)
+
+
+class FireRedConformerEncoder(BaseEncoder):
+    """Conformer encoder module."""
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        input_layer: str = "conv2d",
+        pos_enc_layer_type: str = "rel_pos",
+        normalize_before: bool = True,
+        static_chunk_size: int = 0,
+        use_dynamic_chunk: bool = False,
+        global_cmvn: torch.nn.Module = None,
+        use_dynamic_left_chunk: bool = False,
+        positionwise_conv_kernel_size: int = 1,
+        macaron_style: bool = True,
+        selfattention_layer_type: str = "rel_selfattn",
+        activation_type: str = "swish",
+        use_cnn_module: bool = True,
+        cnn_module_kernel: int = 15,
+        causal: bool = False,
+        cnn_module_norm: str = "batch_norm",
+        query_bias: bool = True,
+        key_bias: bool = True,
+        value_bias: bool = True,
+        conv_bias: bool = True,
+        gradient_checkpointing: bool = False,
+        use_sdpa: bool = False,
+        layer_norm_type: str = 'layer_norm',
+        norm_eps: float = 1e-5,
+        n_kv_head: Optional[int] = None,
+        head_dim: Optional[int] = None,
+        mlp_type: str = 'position_wise_feed_forward',
+        mlp_bias: bool = True,
+        n_expert: int = 8,
+        n_expert_activated: int = 2,
+        conv_norm_eps: float = 1e-5,
+        conv_inner_factor: int = 2,
+        final_norm: bool = True,
+    ):
+        """Construct FireRedConformerEncoder
+
+        Args:
+            input_size to use_dynamic_chunk: see BaseEncoder.
+            positionwise_conv_kernel_size (int): Kernel size of positionwise
+                conv1d layer.
+            macaron_style (bool): Whether to use macaron style for
+                positionwise layer.
+            selfattention_layer_type (str): Encoder attention layer type,
+                the parameter has no effect now, it's just for configure
+                compatibility.
+            activation_type (str): Encoder activation function type.
+            use_cnn_module (bool): Whether to use convolution module.
+            cnn_module_kernel (int): Kernel size of convolution module.
+            causal (bool): whether to use causal convolution or not.
+            key_bias: whether to use bias in attention.linear_k, False for whisper models.
+        """
+        super().__init__(input_size, output_size, attention_heads,
+                         linear_units, num_blocks, dropout_rate,
+                         positional_dropout_rate, attention_dropout_rate,
+                         input_layer, pos_enc_layer_type, normalize_before,
+                         static_chunk_size, use_dynamic_chunk, global_cmvn,
+                         use_dynamic_left_chunk, gradient_checkpointing,
+                         use_sdpa, layer_norm_type, norm_eps, final_norm)
+        activation = WENET_ACTIVATION_CLASSES[activation_type]()
+
+        # self-attention module definition
+        encoder_selfattn_layer_args = (
+            attention_heads,
+            output_size,
+            attention_dropout_rate,
+            query_bias,
+            key_bias,
+            value_bias,
+            use_sdpa,
+            n_kv_head,
+            head_dim,
+        )
+        # feed-forward module definition
+        positionwise_layer_args = (
+            output_size,
+            linear_units,
+            dropout_rate,
+            activation,
+            mlp_bias,
+            n_expert,
+            n_expert_activated,
+        )
+        # convolution module definition
+        convolution_layer_args = (output_size, cnn_module_kernel, activation,
+                                  cnn_module_norm, causal, conv_bias,
+                                  conv_norm_eps, conv_inner_factor)
+
+        mlp_class = WENET_MLP_CLASSES[mlp_type]
+
+        self.encoders = torch.nn.ModuleList([
+            FireRedConformerEncoderLayer(
+                output_size,
+                WENET_ATTENTION_CLASSES[selfattention_layer_type](
+                    *encoder_selfattn_layer_args),
+                mlp_class(*positionwise_layer_args),
+                mlp_class(*positionwise_layer_args) if macaron_style else None,
+                ConvolutionModule(
+                    *convolution_layer_args) if use_cnn_module else None,
+                dropout_rate,
+                normalize_before,
+                layer_norm_type=layer_norm_type,
+                norm_eps=norm_eps,
+            ) for _ in range(num_blocks)
+        ])
diff --git a/wenet/firered/encoder_layer.py b/wenet/firered/encoder_layer.py
@@ -0,0 +1,42 @@
+from typing import Optional
+
+import torch
+from torch import nn
+from wenet.transformer.encoder_layer import ConformerEncoderLayer
+
+
+class FireRedConformerEncoderLayer(ConformerEncoderLayer):
+    """Encoder layer module.
+    Args:
+        size (int): Input dimension.
+        self_attn (torch.nn.Module): Self-attention module instance.
+            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention`
+            instance can be used as the argument.
+        feed_forward (torch.nn.Module): Feed-forward module instance.
+            `PositionwiseFeedForward` instance can be used as the argument.
+        feed_forward_macaron (torch.nn.Module): Additional feed-forward module
+             instance.
+            `PositionwiseFeedForward` instance can be used as the argument.
+        conv_module (torch.nn.Module): Convolution module instance.
+            `ConvolutionModule` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool):
+            True: use layer_norm before each sub-block.
+            False: use layer_norm after each sub-block.
+    """
+
+    def __init__(self,
+                 size: int,
+                 self_attn: torch.nn.Module,
+                 feed_forward: Optional[nn.Module] = None,
+                 feed_forward_macaron: Optional[nn.Module] = None,
+                 conv_module: Optional[nn.Module] = None,
+                 dropout_rate: float = 0.1,
+                 normalize_before: bool = True,
+                 layer_norm_type: str = 'layer_norm',
+                 norm_eps: float = 0.00001):
+        super().__init__(size, self_attn, feed_forward, feed_forward_macaron,
+                         conv_module, dropout_rate, normalize_before,
+                         layer_norm_type, norm_eps)
+        del self.norm_mha
+        self.norm_mha = torch.nn.Identity()
diff --git a/wenet/firered/model.py b/wenet/firered/model.py
@@ -58,8 +58,6 @@ def __init__(
 
         # fix final norm in conformer
         del self.encoder.after_norm
-        # fix output bias
-        del self.decoder.output_layer.bias
 
     @torch.jit.unused
     def forward_encoder_chunk(