
Commit 1a530bb

[touch_asu] save in json format and compute wer in json to avoid tric… (#47)
* [touch_asu] save in json format and compute wer in json to avoid tricky problems
* fix lint
* fix lint in decode.py
1 parent d8fab31 commit 1a530bb

4 files changed: +47 -43 lines changed

Lines changed: 1 addition & 3 deletions
@@ -1,7 +1,5 @@
 {
-  "bos_token_id": 151643,
-  "do_sample": true,
-  "eos_token_id": 151643,
+  "do_sample": false,
   "max_new_tokens": 50,
   "transformers_version": "4.37.0"
 }
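
The updated config keeps only do_sample, max_new_tokens, and the transformers_version stamp, so sampling is disabled during generation. A quick sanity check of the file might look like the following (a hypothetical snippet, not part of the commit; the path is the one run.sh copies from, relative to examples/aishell/asr):

import json

# Load the updated generation config and confirm sampling is off and the
# bos/eos overrides are gone. Hypothetical check, not shipped with the commit.
with open('conf/generation_config.json') as f:
    cfg = json.load(f)

assert cfg['do_sample'] is False
assert 'bos_token_id' not in cfg and 'eos_token_id' not in cfg
print(cfg)  # {'do_sample': False, 'max_new_tokens': 50, 'transformers_version': '4.37.0'}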

examples/aishell/asr/run.sh

Lines changed: 4 additions & 5 deletions
@@ -57,9 +57,8 @@ if [ $stage == "decode" ] || [ $stage == "all" ]; then
   cp conf/generation_config.json $mdir
   python west/bin/decode.py \
     --data_path $data/test.jsonl \
-    --model_dir $PWD/$mdir \
-    --result_path $mdir/result.txt
-  paste <(awk '{print $1}' $data/test.text) $mdir/result.txt > $mdir/result.hyp
-  python tools/compute-wer.py --char=1 --v=1 \
-    $data/test.text $mdir/result.hyp > $mdir/result.wer
+    --model_dir $mdir \
+    --result_path $mdir/result.jsonl
+  python tools/compute_wer.py --char=1 --v=1 \
+    $data/test.jsonl $mdir/result.jsonl > $mdir/result.wer
 fi
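
The paste/awk step that re-attached utterance ids to result.txt is gone: decode.py now writes result.jsonl and compute_wer.py scores it directly against test.jsonl. Judging from the keys read and written in the diffs below, each file holds one JSON object per line, roughly like this (a sketch with made-up contents; only the 'wav' and 'txt' keys come from the code):

import json

# Illustrative line shapes only; real contents come from the AISHELL data
# preparation and the decode step.
ref_line = {'wav': 'BAC009S0764W0121.wav', 'txt': '今天天气不错'}  # one line of $data/test.jsonl
hyp_line = {'txt': '今天天气不错'}                                 # one line of $mdir/result.jsonl

print(json.dumps(ref_line, ensure_ascii=False))
print(json.dumps(hyp_line, ensure_ascii=False))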

tools/compute-wer.py renamed to tools/compute_wer.py

Lines changed: 39 additions & 34 deletions
@@ -1,8 +1,10 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
-import re, sys, unicodedata
 import codecs
+import json
+import sys
+import unicodedata
 
 remove_tag = True
 spacelist = [' ', '\t', '\r', '\n']
@@ -21,17 +23,20 @@ def characterize(string):
             i += 1
             continue
         cat1 = unicodedata.category(char)
-        #https://unicodebook.readthedocs.io/unicode.html#unicode-categories
-        if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist: # space or not assigned
+        # https://unicodebook.readthedocs.io/unicode.html#unicode-categories
+        if cat1 == 'Zs' or cat1 == 'Cn' or char in spacelist:
+            # space or not assigned
             i += 1
             continue
         if cat1 == 'Lo': # letter-other
             res.append(char)
             i += 1
         else:
-            # some input looks like: <unk><noise>, we want to separate it to two words.
+            # some input looks like: <unk><noise>, we want to separate it to
+            # two words.
             sep = ' '
-            if char == '<': sep = '>'
+            if char == '<':
+                sep = '>'
             j = i + 1
             while j < len(string):
                 c = string[j]
@@ -46,7 +51,8 @@ def characterize(string):
 
 
 def stripoff_tags(x):
-    if not x: return ''
+    if not x:
+        return ''
     chars = []
     i = 0
     T = len(x)
@@ -210,9 +216,9 @@ def calculate(self, lab, rec):
             elif self.space[i][j]['error'] == 'non': # starting point
                 break
             else: # shouldn't reach here
-                print(
-                    'this should not happen , i = {i} , j = {j} , error = {error}'
-                    .format(i=i, j=j, error=self.space[i][j]['error']))
+                print('this should not happen '
+                      'i = {i} , j = {j} , error = {error}'.format(
+                          i=i, j=j, error=self.space[i][j]['error']))
         return result
 
     def overall(self):
@@ -286,10 +292,10 @@ def default_cluster(word):
 
 def usage():
     print(
-        "compute-wer.py : compute word error rate (WER) and align recognition results and references."
+        "compute-wer.py : compute word error rate (WER) and align recognition results and references."  # noqa
     )
     print(
-        " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"
+        " usage : python compute-wer.py [--cs={0,1}] [--cluster=foo] [--ig=ignore_file] [--char={0,1}] [--v={0,1}] [--padding-symbol={space,underline}] test.ref test.hyp > test.wer"  # noqa
     )
 
 
@@ -364,7 +370,7 @@ def usage():
         verbose = 0
         try:
             verbose = int(b)
-        except:
+        except Exception:
             if b == 'true' or b != '0':
                 verbose = 1
         continue
@@ -378,7 +384,7 @@ def usage():
         padding_symbol = '_'
         continue
     if True or sys.argv[1].startswith('-'):
-        #ignore invalid switch
+        # ignore invalid switch
         del sys.argv[1]
         continue
 
@@ -391,7 +397,7 @@ def usage():
 
 ref_file = sys.argv[1]
 hyp_file = sys.argv[2]
-rec_set = {}
+rec_list = []
 if split and not case_sensitive:
     newsplit = dict()
     for w in split:
@@ -401,29 +407,30 @@ def usage():
         newsplit[w.upper()] = words
     split = newsplit
 
-with codecs.open(hyp_file, 'r', 'utf-8') as fh:
+with open(hyp_file) as fh:
     for line in fh:
+        item = json.loads(line)
+        assert 'txt' in item
+        line = item['txt']
         if tochar:
             array = characterize(line)
         else:
            array = line.strip().split()
-        if len(array) == 0: continue
-        fid = array[0]
-        rec_set[fid] = normalize(array[1:], ignore_words, case_sensitive,
-                                 split)
+        rec_list.append(
+            normalize(array, ignore_words, case_sensitive, split))
 
 # compute error rate on the interaction of reference file and hyp file
-for line in open(ref_file, 'r', encoding='utf-8'):
+for i, line in enumerate(open(ref_file, 'r', encoding='utf-8')):
+    item = json.loads(line)
+    assert 'txt' in item
+    line = item['txt']
+    fid = item['wav']
     if tochar:
         array = characterize(line)
     else:
         array = line.rstrip('\n').split()
-    if len(array) == 0: continue
-    fid = array[0]
-    if fid not in rec_set:
-        continue
-    lab = normalize(array[1:], ignore_words, case_sensitive, split)
-    rec = rec_set[fid]
+    lab = normalize(line, ignore_words, case_sensitive, split)
+    rec = rec_list[i]
     if verbose:
         print('\nutt: %s' % fid)
 
@@ -489,8 +496,7 @@ def usage():
 
 if verbose:
     print(
-        '==========================================================================='
-    )
+        '================================================================')
     print()
 
 result = calculator.overall()
@@ -525,8 +531,8 @@ def usage():
     for line in open(cluster_file, 'r', encoding='utf-8'):
         for token in line.decode('utf-8').rstrip('\n').split():
             # end of cluster reached, like </Keyword>
-            if token[0:2] == '</' and token[len(token)-1] == '>' and \
-                    token.lstrip('</').rstrip('>') == cluster_id :
+            if (token[0:2] == '</' and token[len(token) - 1] == '>'
+                    and token.lstrip('</').rstrip('>') == cluster_id):
                 result = calculator.cluster(cluster)
                 if result['all'] != 0:
                     wer = float(result['ins'] + result['sub'] +
@@ -540,14 +546,13 @@ def usage():
                 cluster_id = ''
                 cluster = []
             # begin of cluster reached, like <Keyword>
-            elif token[0] == '<' and token[len(token)-1] == '>' and \
-                    cluster_id == '' :
+            elif (token[0] == '<' and token[len(token) - 1] == '>'
+                  and cluster_id == ''):
                 cluster_id = token.lstrip('<').rstrip('>')
                 cluster = []
             # general terms, like WEATHER / CAR / ...
             else:
                 cluster.append(token)
 print()
 print(
-    '==========================================================================='
-)
+    '================================================================')
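
The main behavioural change in compute_wer.py: hypotheses are no longer looked up by an utterance id in the first column. They are collected into rec_list in file order and matched to references by line index, and the reference's 'wav' field now supplies the utterance id shown in the verbose report. A minimal sketch of that pairing (assuming, as the script does, that both files list the same utterances in the same order; the real script then runs each pair through characterize()/normalize() and the Calculator):

import json

def load_jsonl(path):
    # One JSON object per line, each carrying at least a 'txt' field.
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f]

refs = load_jsonl('test.jsonl')    # $data/test.jsonl in run.sh: 'wav' + 'txt'
hyps = load_jsonl('result.jsonl')  # $mdir/result.jsonl: 'txt' only

# Paired purely by position, so both files must have the same length and order.
assert len(refs) == len(hyps)
for i, ref in enumerate(refs):
    print(ref['wav'], '|', ref['txt'], '->', hyps[i]['txt'])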

west/bin/decode.py

Lines changed: 3 additions & 1 deletion
@@ -1,4 +1,5 @@
 # Copyright (c) 2025 Binbin Zhang([email protected])
+import json
 import sys
 from dataclasses import dataclass, field
 
@@ -42,7 +43,8 @@ def main():
         print(text)
         for t in text:
             t = t.replace('\n', ' ')
-            fid.write(t + '\n')
+            item = {'txt': t}
+            fid.write(json.dumps(item, ensure_ascii=False) + '\n')
         sys.stdout.flush()
     fid.close()
 
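Each hypothesis from decode.py is now serialized as a one-key JSON object, with newlines flattened first so the one-object-per-line format stays intact, and ensure_ascii=False so Chinese text is written verbatim rather than as \uXXXX escapes. A tiny illustration with made-up text:

import json

t = '今天\n天气不错'.replace('\n', ' ')              # flatten newlines as decode.py does
print(json.dumps({'txt': t}, ensure_ascii=False))    # {"txt": "今天 天气不错"}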