
Commit 3e027ee

Author: khanld
[feat] add chunkformer training config and results
1 parent 55781fc

File tree

5 files changed: +164 -147 lines

examples/librispeech/s0/README.md

Lines changed: 34 additions & 0 deletions
@@ -313,3 +313,37 @@ test other
| ctc_greedy_search | 8.73 | 9.82 | 9.83 |
| ctc prefix beam search | 8.70 | 9.81 | 9.79 |
| attention rescoring | 8.05 | 9.08 | 9.10 |


## ChunkFormer U2++ Result

* Model info:
  * Encoder params: 32,356,096
  * Downsample rate: dw_striding 8x
  * encoder_dim 256, head 4, linear_units 2048
  * num_blocks 12, cnn_module_kernel 15
* Feature info: fbank feature, cmvn, dither, online speed perturb
* Training info:
  * train_u2++_chunkformer_small.yaml, kernel size 15
  * dynamic batch size 120,000 frames, 2 GPUs, acc_grad 4, 200 epochs, dither 1.0
  * adamw, lr 1e-3, warmuplr, warmup_steps 25000
  * specaug and speed perturb
* Decoding info: ctc_weight 0.3, reverse_weight 0.5, average_num 100, beam size 10

#### Full context training -> chunk context inference:
⚠️ The attention decoder does **not** support chunk-context inference: its cross-attention was trained with full context, so there is a train/inference mismatch. Chunk-context training is required to resolve it.

| Decoding Mode           | Dev Clean | Dev Other | Test Clean | Test Other |
|-------------------------|-----------|-----------|------------|------------|
| CTC Greedy Search       | 3.05      | 8.84      | 3.27       | 8.54       |
| CTC Prefix Beam Search  | 3.04      | 8.83      | 3.26       | 8.54       |
| Attention Decoder       | 4.58      | 9.62      | 5.07       | 9.22       |
| Attention Rescoring     | 2.83      | 8.39      | 2.97       | 8.02       |

#### Full context training -> full context inference:
| Decoding Mode           | Dev Clean | Dev Other | Test Clean | Test Other |
|-------------------------|-----------|-----------|------------|------------|
| CTC Greedy Search       | 3.08      | 8.82      | 3.24       | 8.55       |
| CTC Prefix Beam Search  | 3.06      | 8.80      | 3.23       | 8.53       |
| Attention Decoder       | 2.92      | 8.28      | 3.03       | 8.05       |
| Attention Rescoring     | 2.80      | 8.37      | 2.94       | 8.03       |
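For orientation, here is a minimal sketch of how attention rescoring with the decoding settings above (ctc_weight 0.3, reverse_weight 0.5) typically combines the CTC n-best scores with the left-to-right and right-to-left decoder branches in WeNet-style recipes. The function name, argument names, and tensor layout are illustrative assumptions, not the repo's actual API.

```python
import torch

def rescore_hypotheses(ctc_nbest, decoder_out, r_decoder_out,
                       ctc_weight=0.3, reverse_weight=0.5, eos=2):
    """Illustrative rescoring of CTC n-best hypotheses (sketch, not the repo code).

    ctc_nbest:     list of (token_id_list, ctc_score) from CTC prefix beam search
    decoder_out:   (N, L+1, V) log-probs from the left-to-right attention decoder
    r_decoder_out: (N, L+1, V) log-probs from the right-to-left decoder branch
    """
    best_score, best_index = -float("inf"), 0
    for i, (tokens, ctc_score) in enumerate(ctc_nbest):
        # Left-to-right decoder score: sum of log-probs of the tokens plus <eos>.
        score = sum(decoder_out[i, j, w].item() for j, w in enumerate(tokens))
        score += decoder_out[i, len(tokens), eos].item()
        # Right-to-left branch scores the reversed token sequence.
        r_score = sum(r_decoder_out[i, j, w].item()
                      for j, w in enumerate(reversed(tokens)))
        r_score += r_decoder_out[i, len(tokens), eos].item()
        # Interpolate the two decoder directions, then add the weighted CTC score.
        score = (1 - reverse_weight) * score + reverse_weight * r_score
        score += ctc_weight * ctc_score
        if score > best_score:
            best_score, best_index = score, i
    return ctc_nbest[best_index][0], best_score
```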
examples/librispeech/s0/conf/train_u2++_chunkformer_small.yaml

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
# network architecture
# encoder related
encoder: chunkformer
encoder_conf:
    output_size: 256    # dimension of attention
    attention_heads: 4
    linear_units: 2048  # the number of units of position-wise feed forward
    num_blocks: 12      # the number of encoder blocks
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: dw_striding    # encoder input type, you can choose conv2d, conv2d6 and conv2d8
    normalize_before: true
    cnn_module_kernel: 15
    use_cnn_module: True
    activation_type: 'swish'
    pos_enc_layer_type: 'chunk_rel_pos'
    selfattention_layer_type: 'chunk_rel_seflattn'
    dynamic_conv: false
    cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster

# decoder related
decoder: bitransformer
decoder_conf:
    attention_heads: 4
    linear_units: 2048
    num_blocks: 3
    r_num_blocks: 3
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

tokenizer: bpe
tokenizer_conf:
    symbol_table_path: 'data/lang_char/train_960_bpe5000_units.txt'
    split_with_space: false
    bpe_path: 'data/lang_char/train_960_bpe5000.model'
    non_lang_syms_path: null
    is_multilingual: false
    num_languages: 1
    special_tokens:
        <blank>: 0
        <unk>: 1
        <sos>: 2
        <eos>: 2

ctc: ctc
ctc_conf:
    ctc_blank_id: 0

cmvn: global_cmvn
cmvn_conf:
    cmvn_file: 'data/train_960/global_cmvn'
    is_json_cmvn: true

# hybrid CTC/attention
model: asr_model
model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1     # label smoothing option
    length_normalized_loss: false
    reverse_weight: 0.3

# dataset related
dataset: asr
dataset_conf:
    filter_conf:
        max_length: 40960
        min_length: 0
        token_max_length: 400
        token_min_length: 1
        # min_output_input_ratio: 0.0005
        # max_output_input_ratio: 0.1
    resample_conf:
        resample_rate: 16000
    speed_perturb: true
    fbank_conf:
        num_mel_bins: 80
        frame_shift: 10
        frame_length: 25
        dither: 1.0
    spec_aug: true
    spec_aug_conf:
        num_t_mask: 2
        num_f_mask: 2
        max_t: 50
        max_f: 10
    spec_sub: false
    spec_sub_conf:
        num_t_sub: 3
        max_t: 30
    shuffle: true
    shuffle_conf:
        shuffle_size: 1000
    sort: false
    sort_conf:
        sort_size: 2000  # sort_size should be less than shuffle_size
    batch_conf:
        batch_type: 'dynamic' # static or dynamic
        max_frames_in_batch: 120000
    # At inference, pad_feat should be False to activate
    # masked batch and chunk-context decoding
    pad_feat: True

grad_clip: 5
accum_grad: 4
max_epoch: 200
log_interval: 100

optim: adamw
optim_conf:
    lr: 0.001
scheduler: warmuplr     # pytorch v1.1.0+ required
scheduler_conf:
    warmup_steps: 25000
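The batch_conf above uses dynamic batching: utterances are accumulated until the batch, padded to its longest utterance, would exceed max_frames_in_batch (120,000 frames). A minimal sketch of that policy is below; the 'num_frames' field and function name are assumptions for illustration, and the real dataset pipeline differs in detail.

```python
def dynamic_batches(samples, max_frames_in_batch=120000):
    """Group samples so that, after padding each batch to its longest utterance,
    the total frame count stays under max_frames_in_batch (sketch only)."""
    batch, longest = [], 0
    for sample in samples:
        n = sample["num_frames"]          # hypothetical per-sample frame count
        longest = max(longest, n)
        # Adding this sample would exceed the frame budget: emit the batch first.
        if batch and longest * (len(batch) + 1) > max_frames_in_batch:
            yield batch
            batch, longest = [], n
        batch.append(sample)
    if batch:
        yield batch
```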

wenet/chunkformer/embedding.py

Lines changed: 4 additions & 6 deletions
@@ -25,16 +25,14 @@ def __init__(self, d_model: int, dropout_rate: float, max_len: int = 5000) -> None:
         self.max_len = max_len
         self.extend_pe(max_len)
 
-    def extend_pe(self, size: int, left_context: Union[int, torch.Tensor] = 0) -> None:
+    def extend_pe(self, size: int) -> None:
         """Reset the positional encodings."""
-        x_size_1 = size + left_context
-
         # Suppose `i` means to the position of query vector and `j` means the
         # position of key vector. We use position relative positions when keys
         # are to the left (i>j) and negative relative positions otherwise (i<j).
-        pe_positive = torch.zeros(x_size_1, self.d_model)
-        pe_negative = torch.zeros(x_size_1, self.d_model)
-        position = torch.arange(0, x_size_1, dtype=torch.float32).unsqueeze(1)
+        pe_positive = torch.zeros(size, self.d_model)
+        pe_negative = torch.zeros(size, self.d_model)
+        position = torch.arange(0, size, dtype=torch.float32).unsqueeze(1)
         div_term = torch.exp(
             torch.arange(0, self.d_model, 2, dtype=torch.float32)
             * -(math.log(10000.0) / self.d_model)
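The change drops the left_context argument, so extend_pe only needs `size` positions in each direction. For reference, a self-contained sketch of the relative positional encoding table this kind of extend_pe builds (the standard Transformer-XL/Conformer pattern); the exact lines, dtype, and device handling in this file may differ.

```python
import math
import torch

def build_rel_pos_table(size: int, d_model: int) -> torch.Tensor:
    """Sketch of a relative positional encoding table over offsets
    size-1 .. -(size-1); returns a (1, 2*size - 1, d_model) tensor."""
    pe_positive = torch.zeros(size, d_model)
    pe_negative = torch.zeros(size, d_model)
    position = torch.arange(0, size, dtype=torch.float32).unsqueeze(1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2, dtype=torch.float32)
        * -(math.log(10000.0) / d_model))
    pe_positive[:, 0::2] = torch.sin(position * div_term)
    pe_positive[:, 1::2] = torch.cos(position * div_term)
    pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
    pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
    # Flip so index 0 holds the largest positive offset, then append the
    # strictly negative offsets (skipping the duplicate offset 0).
    pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
    pe_negative = pe_negative[1:].unsqueeze(0)
    return torch.cat([pe_positive, pe_negative], dim=1)
```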

wenet/chunkformer/encoder.py

Lines changed: 10 additions & 10 deletions
@@ -257,8 +257,8 @@ def forward_parallel_chunk(
 
         conv_lorder = self.cnn_module_kernel // 2
 
-        upper_bounds = []
-        lower_bounds = []
+        upper_bounds_att = []
+        lower_bounds_att = []
         upper_bounds_conv = []
         lower_bounds_conv = []
         x_pad = []
@@ -279,13 +279,13 @@ def forward_parallel_chunk(
 
             # attention boundaries
             max_len = 1 + (xs_origin_len - context) // subsampling
-            upper_bound = chunk_size + right_context_size + torch.arange(
+            upper_bound_att = chunk_size + right_context_size + torch.arange(
                 0,
                 1 + (xs_origin_len + n_frames_pad - context) // subsampling,
                 1 + (size - context) // subsampling, device=device
             )
-            lower_bound = upper_bound - max_len
-            upper_bound += offs
+            lower_bound_att = upper_bound_att - max_len
+            upper_bound_att += offs
 
             # convolution boundaries
             upper_bound_conv = chunk_size + conv_lorder + torch.arange(
@@ -301,8 +301,8 @@ def forward_parallel_chunk(
 
 
             xs_lens += [size] * (n_chunk - 1) + [size - n_frames_pad]
-            upper_bounds.append(upper_bound)
-            lower_bounds.append(lower_bound)
+            upper_bounds_att.append(upper_bound_att)
+            lower_bounds_att.append(lower_bound_att)
             upper_bounds_conv.append(upper_bound_conv)
             lower_bounds_conv.append(lower_bound_conv)
             x_pad.append(x)
@@ -312,8 +312,8 @@ def forward_parallel_chunk(
         xs = torch.cat(x_pad, dim=0).to(device)
         xs_lens = torch.tensor(xs_lens).to(device)
         masks = ~make_pad_mask(xs_lens, xs.size(1)).unsqueeze(1) # (B, 1, T)
-        upper_bounds = torch.cat(upper_bounds).unsqueeze(1).to(device)
-        lower_bounds = torch.cat(lower_bounds).unsqueeze(1).to(device)
+        upper_bounds_att = torch.cat(upper_bounds_att).unsqueeze(1).to(device)
+        lower_bounds_att = torch.cat(lower_bounds_att).unsqueeze(1).to(device)
         upper_bounds_conv = torch.cat(upper_bounds_conv).unsqueeze(1).to(device)
         lower_bounds_conv = torch.cat(lower_bounds_conv).unsqueeze(1).to(device)
 
@@ -346,7 +346,7 @@ def forward_parallel_chunk(
             left_context_size + chunk_size + right_context_size,
             device=masks.device
         ).unsqueeze(0).repeat(xs.size(0), 1)
-        att_mask = (lower_bounds <= att_mask) & (att_mask < upper_bounds)
+        att_mask = (lower_bounds_att <= att_mask) & (att_mask < upper_bounds_att)
         att_mask = att_mask.flip(-1).unsqueeze(1)
 
         r_att_cache = []
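The renamed *_att bounds (now distinct from the convolution bounds) define, per chunk, the range of positions each query may attend to; the mask is built with a broadcast comparison against a position index. A small, self-contained toy illustration of that masking pattern is below; the bound values are made up, while in the encoder they come from the chunk/context geometry computed above.

```python
import torch

# Each row i may attend only to positions p with lower[i] <= p < upper[i].
num_chunks, width = 3, 8                       # width ~ left + chunk + right context
positions = torch.arange(width).unsqueeze(0).repeat(num_chunks, 1)  # (num_chunks, width)
lower = torch.tensor([0, 2, 4]).unsqueeze(1)   # hypothetical lower bounds, (num_chunks, 1)
upper = torch.tensor([4, 6, 8]).unsqueeze(1)   # hypothetical upper bounds, (num_chunks, 1)

att_mask = (lower <= positions) & (positions < upper)  # broadcasts to (num_chunks, width)
print(att_mask.int())
# tensor([[1, 1, 1, 1, 0, 0, 0, 0],
#         [0, 0, 1, 1, 1, 1, 0, 0],
#         [0, 0, 0, 0, 1, 1, 1, 1]])
```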

wenet/chunkformer/encoder_layer copy.py

Lines changed: 0 additions & 131 deletions
This file was deleted.
