[fix] fix hey_snips training script (#195) (#204)

mlxu995 · menglong.xu · web-flow · commit 1a8ee651620f · 2025-09-17T10:05:23.000+08:00
Co-authored-by: menglong.xu <menglong.xu@aispeech.com>
diff --git a/examples/hey_snips/s0/conf/ds_tcn.yaml b/examples/hey_snips/s0/conf/ds_tcn.yaml
@@ -2,13 +2,17 @@ dataset_conf:
     filter_conf:
         max_length: 2048
         min_length: 0
+        token_max_length: 200
+        token_min_length: 1
+        max_output_input_ratio: 1
+        min_output_input_ratio: 0.0005
     resample_conf:
         resample_rate: 16000
     speed_perturb: false
     reverb_prob: 0.2
     noise_prob: 0.3
-    feature_extraction_conf:
-        feature_type: 'fbank'
+    feats_type: 'fbank'
+    fbank_conf:
         num_mel_bins: 40
         frame_shift: 10
         frame_length: 25
@@ -22,6 +26,7 @@ dataset_conf:
     shuffle: true
     shuffle_conf:
         shuffle_size: 1500
+    sort: false
     batch_conf:
         batch_size: 256
 
diff --git a/examples/hey_snips/s0/local/prepare_data.py b/examples/hey_snips/s0/local/prepare_data.py
@@ -19,24 +19,31 @@ def main():
                         type=str,
                         help='dir containing all the wav files')
     parser.add_argument('path', type=str, help='path to the json file')
+    parser.add_argument('dict', type=str, help='path to the dict file')
     parser.add_argument('out_dir', type=str, help='out dir')
     args = parser.parse_args()
 
+    id2token = {}
+    with open(args.dict, 'r', encoding='utf-8') as f:
+        for line in f:
+            token, idx = line.strip().split()
+            id2token[int(idx)] = token
+
     with open(args.path, 'r', encoding='utf-8') as f:
         data = json.load(f)
-        utt_id, label = [], []
+        utt_id, text = [], []
         for entry in data:
             if entry['duration'] > 0:
                 utt_id.append(entry['id'])
                 keyword_id = 0 if entry['is_hotword'] == 1 else -1
-                label.append(keyword_id)
+                text.append(id2token[keyword_id])
 
     abs_dir = os.path.abspath(args.wav_dir)
     wav_path = os.path.join(args.out_dir, 'wav.scp')
     text_path = os.path.join(args.out_dir, 'text')
     with open(wav_path, 'w', encoding='utf-8') as f_wav, \
          open(text_path, 'w', encoding='utf-8') as f_text:
-        for utt, l in zip(utt_id, label):
+        for utt, l in zip(utt_id, text):
             f_wav.write('{} {}\n'.format(utt,
                                          os.path.join(abs_dir, utt + ".wav")))
             f_text.write('{} {}\n'.format(utt, l))
diff --git a/examples/hey_snips/s0/run.sh b/examples/hey_snips/s0/run.sh
@@ -4,8 +4,10 @@
 
 . ./path.sh
 
-stage=0
-stop_stage=4
+set -euo pipefail
+
+stage=$1
+stop_stage=$2
 num_keywords=1
 
 config=conf/ds_tcn.yaml
@@ -24,8 +26,7 @@ noise_lmdb=
 reverb_lmdb=
 
 . tools/parse_options.sh || exit 1;
-
-set -euo pipefail
+window_shift=50
 
 if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
   echo "Extracte all datasets"
@@ -36,14 +37,15 @@ fi
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
   echo "Preparing datasets..."
   mkdir -p dict
-  echo "<filler> -1" > dict/words.txt
-  echo "Hey_Snips 0" >> dict/words.txt
+  echo "<FILLER> -1" > dict/dict.txt
+  echo "<HEY_SNIPS> 0" >> dict/dict.txt
+  awk '{print $1}' dict/dict.txt > dict/words.txt
 
   for folder in train dev test; do
     mkdir -p data/$folder
     json_path=$download_dir/hey_snips_research_6k_en_train_eval_clean_ter/$folder.json
     local/prepare_data.py $download_dir/hey_snips_research_6k_en_train_eval_clean_ter/audio_files $json_path \
-      data/$folder
+      dict/dict.txt data/$folder
   done
 fi
 
@@ -78,7 +80,8 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     --num_workers 8 \
     --num_keywords $num_keywords \
     --min_duration 50 \
-    --seed 777 \
+    --seed 666 \
+    --dict ./dict \
     $cmvn_opts \
     ${reverb_lmdb:+--reverb_lmdb $reverb_lmdb} \
     ${noise_lmdb:+--noise_lmdb $noise_lmdb} \
@@ -97,21 +100,23 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
   python wekws/bin/score.py \
     --config $dir/config.yaml \
     --test_data data/test/data.list \
+    --gpu 0 \
     --batch_size 256 \
     --checkpoint $score_checkpoint \
     --score_file $result_dir/score.txt \
+    --dict ./dict \
     --num_workers 8
-  first_keyword=0
-  last_keyword=$(($num_keywords+$first_keyword-1))
-  for keyword in $(seq $first_keyword $last_keyword); do
+
+  for keyword in `tail -n +2 dict/words.txt`; do
     python wekws/bin/compute_det.py \
       --keyword $keyword \
       --test_data data/test/data.list \
+      --window_shift $window_shift \
       --score_file $result_dir/score.txt \
       --stats_file $result_dir/stats.${keyword}.txt
   done
   python wekws/bin/plot_det_curve.py \
-    --keywords_dict dict/words.txt \
+    --keywords_dict dict/dict.txt \
     --stats_dir $result_dir \
     --figure_file $result_dir/det.png \
     --xlim 10 \
diff --git a/wekws/bin/score.py b/wekws/bin/score.py
@@ -80,6 +80,7 @@ def main():
     test_conf = copy.deepcopy(configs['dataset_conf'])
     test_conf['filter_conf']['max_length'] = 102400
     test_conf['filter_conf']['min_length'] = 0
+    test_conf['filter_conf']['min_output_input_ratio'] = 0
     test_conf['speed_perturb'] = False
     test_conf['spec_aug'] = False
     test_conf['shuffle'] = False