Commit 5caa735

[example] add qa recipe
1 parent 7f1a7aa commit 5caa735

File tree

8 files changed: +174 -11 lines changed
examples/belle_1.4M_qa/conf/accelerator_config.json

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
{
    "dispatch_batches": false
}
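
run.sh passes this file to the trainer via --accelerator_config. A minimal sketch of the setting it controls, assuming the usual HuggingFace/accelerate plumbing behind that flag (the DataLoaderConfiguration class below comes from accelerate, not from this repo):

# Sketch only: dispatch_batches=False means every rank iterates its own
# dataloader, instead of rank 0 fetching batches and broadcasting the slices.
# That is the usual choice for sharded/iterable datasets such as the one in
# west/dataset/dataset.py.
from accelerate.utils import DataLoaderConfiguration

dl_config = DataLoaderConfiguration(dispatch_batches=False)
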
examples/belle_1.4M_qa/conf/ds_config_zero2.json

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
{
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "loss_scale_window": 1000,
        "initial_scale_power": 16,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "bf16": {
        "enabled": "auto"
    },
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto"
        }
    },

    "scheduler": {
        "type": "WarmupLR",
        "params": {
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto"
        }
    },

    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {
            "device": "none",
            "pin_memory": true
        },
        "offload_param": {
            "device": "none",
            "pin_memory": true
        },
        "overlap_comm": true,
        "contiguous_gradients": true,
        "sub_group_size": 1e9,
        "reduce_bucket_size": "auto"
    },

    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "steps_per_print": 100,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "wall_clock_breakdown": false
}
examples/belle_1.4M_qa/conf/touch_asu_config.json

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
{
    "encoder_ds_rate": 4,
    "encoder_projector_ds_rate": 2,
    "llm_model_name_or_path": "/bucket/output/jfs-hdfs/user/binbin.zhang/huggingface/hub/Qwen2-1.5B-Instruct",
    "lora_config": null,
    "model_type": "touch_asu",
    "projector_hidden_size": 2048,
    "transformers_version": "4.52.3",
    "wenet_model_name_or_path": "/bucket/output/jfs-hdfs/user/binbin.zhang/models/wenet/wenetspeech/u2pp_conformer/"
}

examples/belle_1.4M_qa/run.sh

Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
# Copyright 2025 Binbin Zhang([email protected])

[ ! -s west ] && ln -s ../../west
[ ! -s tools ] && ln -s ../../tools
export PYTHONPATH=$PYTHONPATH:$PWD

export CUDA_VISIBLE_DEVICES="1"  # Change this to all your available GPUs, such as "0,1,2,3"
num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F ',' '{print NF}')

model_config_or_dir=pretrain_qwen1.7b_aishell_asr

stage=decode  # data/train/decode
data=data

steps=10000  # training steps
pack_size=8192
lr_rate=5e-5
dir=exp/Qwen3-1.7B-Instruct-firered-${pack_size}-${lr_rate}

# Note: Change your model settings in `conf/touch_asu_config.json`


if [ $stage == "data" ] || [ $stage == "all" ]; then
  echo "Prepare required data"
  # TODO:
  mkdir -p $data
  cp -r /jfs-hdfs/user/Archive/AQA/qa_test/chinese_qa.jsonl $data
fi


if [ $stage == "train" ] || [ $stage == "all" ]; then
  torchrun --standalone --nnodes=1 --nproc_per_node=$num_gpus west/bin/train.py \
    --model_config_or_dir $model_config_or_dir \
    --data_path $data/data_tn_cn_messages_aishell.list \
    --output_dir $dir \
    --pack_size $pack_size \
    --bf16 True \
    --max_steps $steps \
    --num_data_cycles 100 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 1 \
    --save_strategy "steps" \
    --save_steps 100 \
    --save_total_limit 10000 \
    --learning_rate $lr_rate \
    --weight_decay 0.01 \
    --adam_beta2 0.95 \
    --warmup_ratio 0.5 \
    --lr_scheduler_type "cosine" \
    --logging_steps 1 \
    --report_to "tensorboard" \
    --gradient_checkpointing \
    --dataloader_num_workers 2 \
    --dataloader_prefetch_factor 10 \
    --deepspeed conf/ds_config_zero2.json \
    --accelerator_config conf/accelerator_config.json
fi


if [ $stage == "decode" ] || [ $stage == "all" ]; then
  mdir=$dir/checkpoint-${steps}
  python west/bin/decode.py \
    --data_path $data/chinese_qa.jsonl \
    --model_dir $mdir \
    --result_path $mdir/result.txt
  python tools/get_qa_hyp_ref_text.py $data/chinese_qa_messages.jsonl \
    $mdir/result.txt $mdir/result.json
  python tools/compute-acc-of-contain.py $mdir/result.json
fi
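
The final step scores the decoded answers with tools/compute-acc-of-contain.py. That script is not part of this diff; the sketch below only illustrates the metric its name suggests, under the assumption that get_qa_hyp_ref_text.py writes a JSON list of items with ref and hyp fields: an answer counts as correct when the reference string is contained in the hypothesis.

import json

def contain_accuracy(result_json_path):
    # Hypothetical format: [{"ref": "北京", "hyp": "中国的首都是北京。"}, ...]
    with open(result_json_path, encoding='utf8') as f:
        items = json.load(f)
    hits = sum(1 for it in items if it['ref'] in it['hyp'])
    return hits / max(len(items), 1)
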

examples/belle_1.4M_qa/tools

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
../../tools

examples/belle_1.4M_qa/west

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
../../west

west/dataset/dataset.py

Lines changed: 6 additions & 3 deletions
@@ -131,6 +131,9 @@ def _read_one(self):
             try:
                 x['txt'] = x['txt'].decode('utf8')
                 x['wav'] = io.BytesIO(x['wav'])
+                if "messages" in x.keys():
+                    x['messages'] = json.loads(
+                        x['messages'].decode('utf8'))
                 yield x
             except Exception:
                 logging.info(f'Dataset decode error, {line}')

@@ -244,9 +247,9 @@ def __iter__(self) -> Dict[str, torch.Tensor]:
     print(tokenizer.bos_token_id)
     data_args = DataArguments
     data_args.data_path = 'data/train.jsonl'
-    data_args.extractor_type = 'tts_codec'
-    dataset = SpeechDataset(tokenizer, data_args)
+    data_args.extractor_type = 'touch_asu'
+    extractor = Extractor.get_class(data_args.extractor_type)(tokenizer)
+    dataset = SpeechDataset(extractor, data_args)
     for i, x in enumerate(dataset):
-        print(x)
         if i > 0:
             break
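
For reference, a self-contained illustration of what the new branch handles (the field values are made up; only the decode/json.loads behaviour mirrors the diff): a shard record carrying an optional messages field as UTF-8 encoded JSON comes out of _read_one as a plain Python list of chat turns.

import io
import json

# Hypothetical raw record as it would arrive from a shard/tar entry.
raw = {
    'txt': '北京'.encode('utf8'),
    'wav': b'<wav bytes>',
    'messages': json.dumps([
        {'role': 'user', 'content': 'What is the capital of China?'},
        {'role': 'assistant', 'content': 'The capital of China is Beijing.'},
    ]).encode('utf8'),
}

# Same transformations as the patched _read_one().
raw['txt'] = raw['txt'].decode('utf8')
raw['wav'] = io.BytesIO(raw['wav'])
if 'messages' in raw.keys():
    raw['messages'] = json.loads(raw['messages'].decode('utf8'))
print(raw['messages'][0]['role'])  # -> user
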

west/models/touch_asu/extractor_touch_asu.py

Lines changed: 28 additions & 8 deletions
@@ -14,14 +14,31 @@ class ExtractorTouchASU(Extractor):
     fields_pack_offset = {'audio_offsets'}
 
     def extract(self, item):
+        """
+        1. speech pretraining data (asr):
+            messages = [
+                {'role': 'user', 'content': [{
+                    'type': 'text', 'text': 'Transcribe the Speech'}]},
+                {'role': 'assistant', 'content': item['txt']},
+            ]
+        2. QA: SFT data (multi-turn)
+            messages = [
+                {'role': 'system', 'content': 'You are a helpful assistant.'},  # optional # noqa
+                {'role': 'user', 'content': 'What is the capital of China?'},  # optional # noqa
+                {'role': 'assistant', 'content': 'The capital of China is Beijing.'},  # optional # noqa
+                {'role': 'user', 'content': {'type': 'audio', 'audio': item['wav']}},  # last turn # noqa
+                {'role': 'assistant', 'content': item['txt']},
+            ]
+        """
         IGNORE_TOKEN_ID = LabelSmoother.ignore_index
-        if 'messages' in item:  # OpenAI role-content based SFT data
+        # OpenAI role-content based SFT data
+        # At least one pair of "user" and "assistant"
+        if 'messages' in item and len(item["messages"]) >= 2:
             messages = item['messages']
         else:  # Speech pretraining data
             messages = [
                 {
-                    'role':
-                    'user',
+                    'role': 'user',
                     'content': [{
                         'type': 'text',
                         'text': 'Transcribe the Speech'

@@ -36,13 +53,16 @@ def extract(self, item):
                 },
             ]
 
-        t0 = '<|im_start|>user\n'
+        t0 = ''
         t1 = '<|audio_eos|><|im_end|>\n' + '<|im_start|>assistant\n'
         t2 = ''
-        for msg in messages:
-            if msg['role'] == 'system':
-                t0 += msg['content']
-            elif msg['role'] == 'user':
+        # multi-turn
+        for msg in messages[:-2]:
+            t0 += '<|im_start|>' + msg['role'] + '\n' + \
+                msg['content'] + '<|im_end|>\n'
+        for msg in messages[-2:]:
+            if msg['role'] == 'user':
+                t0 += '<|im_start|>user\n'
                 if isinstance(msg['content'], dict):
                     assert msg['content']['type'] == 'audio'
                     t0 += '<|audio_bos|>'