27 changes: 27 additions & 0 deletions examples/aishell2/asr/README.md
@@ -0,0 +1,27 @@
## Tutorial

First, prepare the training data `data/train.jsonl`; each line is a JSON object like:

```
{"key": "IC0001W0001", "wav": "AISHELL-2/iOS/data/wav/C0001/IC0001W0001.wav", "txt": "厨房用具"}
{"key": "IC0001W0002", "wav": "AISHELL-2/iOS/data/wav/C0001/IC0001W0002.wav", "txt": "电压力锅"}
```
where `key` is the utterance ID, `wav` is the path to the audio file, and `txt` is its transcript.
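
If you are starting from a raw AISHELL-2 download, here is a minimal sketch for building the jsonl. It assumes the stock `trans.txt` (one `<key> <transcript>` pair per line) and that the speaker directory is characters 2-6 of the key, as in the examples above:

``` python
import json
import os

root = "AISHELL-2/iOS/data"  # adjust to your AISHELL-2 location

os.makedirs("data", exist_ok=True)
with open(os.path.join(root, "trans.txt"), encoding="utf-8") as fin, \
        open("data/train.jsonl", "w", encoding="utf-8") as fout:
    for line in fin:
        key, txt = line.strip().split(maxsplit=1)
        # e.g. key "IC0001W0001" -> speaker directory "C0001"
        wav = os.path.join(root, "wav", key[1:6], key + ".wav")
        fout.write(json.dumps({"key": key, "wav": wav, "txt": txt},
                              ensure_ascii=False) + "\n")
```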

To train the model, just run

``` shell
bash run.sh --stage train
```

To decode, prepare the test data `data/test.jsonl` in the same format as the training data, then run

``` shell
bash run.sh --stage decode
```
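
Decoding writes the hypotheses to `result.jsonl` under the checkpoint directory, then scores them against the references with `tools/compute_wer.py` at the character level; the CER report lands in `result.wer` (see `run.sh`).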

## Results

| LLM        | Speech Encoder | LoRA | Test CER (%) | Details                                   |
|------------|----------------|------|--------------|-------------------------------------------|
| Qwen3-1.7B | firered        | No   | 5.41         | 4 A800 GPUs, pack_size 18000, 10000 steps |
3 changes: 3 additions & 0 deletions examples/aishell2/asr/conf/accelerator_config.json
@@ -0,0 +1,3 @@
{
"dispatch_batches": false
}
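
`dispatch_batches: false` makes HF Accelerate let each process iterate its own dataloader, instead of fetching batches on the main process and broadcasting slices; this is the usual setting for iterable or packed datasets.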
54 changes: 54 additions & 0 deletions examples/aishell2/asr/conf/ds_config_zero2.json
@@ -0,0 +1,54 @@
{
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},

"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": "auto",
"warmup_max_lr": "auto",
"warmup_num_steps": "auto"
}
},

"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "none",
"pin_memory": true
},
"offload_param": {
"device": "none",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto"
},

"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"steps_per_print": 100,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}
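
The `"auto"` values here are resolved from the HF `TrainingArguments` at launch. Otherwise the file selects ZeRO stage 2, which shards optimizer states and gradients across GPUs while keeping parameters replicated, with both offload targets disabled (`"device": "none"`).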
9 changes: 9 additions & 0 deletions examples/aishell2/asr/conf/generation_config.json
@@ -0,0 +1,9 @@
{
"do_sample": false,
"eos_token_id": [
151645,
151643
],
"max_new_tokens": 50,
"transformers_version": "4.37.0"
}
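
The two `eos_token_id` entries are Qwen's `<|im_end|>` (151645) and `<|endoftext|>` (151643), so generation stops at the end of the assistant turn; with `do_sample` disabled, decoding is greedy and capped at 50 new tokens per utterance.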
17 changes: 17 additions & 0 deletions examples/aishell2/asr/conf/qwen3-1.7b_firered.json
@@ -0,0 +1,17 @@
{
"architectures": [
"TouchASU"
],
"encoder_ds_rate": 4,
"encoder_projector_ds_rate": 2,
"hidden_size": 2048,
"llm_model_name_or_path": "Qwen/Qwen3-1.7B",
"lora_config": null,
"max_speech_frames": 2000,
"min_speech_frames": 20,
"model_type": "touch_asu",
"projector_hidden_size": 2048,
"torch_dtype": "bfloat16",
"transformers_version": "4.52.3",
"wenet_model_name_or_path": "firered"
}
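
Reading the fields: speech features are downsampled 4x by the encoder and a further 2x by the projector (8x overall) before reaching the LLM; utterances are limited to 20-2000 speech frames; the backbone is `Qwen/Qwen3-1.7B` with LoRA disabled (`lora_config: null`), matching the "No" in the results table.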
63 changes: 63 additions & 0 deletions examples/aishell2/asr/run.sh
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Copyright 2025 Binbin Zhang([email protected])

# Symlink the west package and helper tools into this example directory
[ ! -s west ] && ln -s ../../../west
[ ! -s tools ] && ln -s ../../../tools
export PYTHONPATH=$PYTHONPATH:$PWD
# Change this to all your available gpus, such as "0,1,2,3"
export CUDA_VISIBLE_DEVICES="0,1,2,3"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F ',' '{print NF}')

stage=train # one of: data, train, decode, all
data=data
dir=exp/Qwen3-1.7B-firered
steps=5000 # training steps

model_conf=conf/qwen3-1.7b_firered.json
decode_conf=conf/generation_config.json

# Consume command-line overrides such as "--stage decode"
# (WeNet-style helper, assumed to be provided by the symlinked tools/ dir)
[ -f tools/parse_options.sh ] && . tools/parse_options.sh

if [ $stage == "data" ] || [ $stage == "all" ]; then
  # Data preparation is manual for now: create $data/train.jsonl and
  # $data/test.jsonl in the format described in README.md
  echo "Prepare required data"
fi


if [ $stage == "train" ] || [ $stage == "all" ]; then
  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus west/bin/train.py \
    --model_config_or_dir $model_conf \
    --data_path $data/train.jsonl \
    --output_dir $dir \
    --pack_size 8192 \
    --bf16 True \
    --max_steps $steps \
    --num_data_cycles 1000 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 4 \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 100 \
    --learning_rate 3e-4 \
    --weight_decay 0.01 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.5 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "tensorboard" \
    --gradient_checkpointing \
    --dataloader_num_workers 2 \
    --dataloader_prefetch_factor 10 \
    --deepspeed conf/ds_config_zero2.json \
    --accelerator_config conf/accelerator_config.json
fi


if [ $stage == "decode" ] || [ $stage == "all" ]; then
  mdir=$dir/checkpoint-${steps}
  # Copy the generation config into the checkpoint so decoding picks it up
  cp $decode_conf $mdir
  python west/bin/decode.py \
    --data_path $data/test.jsonl \
    --model_dir $mdir \
    --result_path $mdir/result.jsonl
  # Character-level error rate (CER) scoring
  python tools/compute_wer.py --char=1 --v=1 \
    $data/test.jsonl $mdir/result.jsonl > $mdir/result.wer
fi