[osum-echat] fix bugs for lint

dragongeng · dragongeng · commit 874ce03b2516 · 2025-10-13T19:17:10.000+08:00
diff --git a/.style.yapf b/.style.yapf
@@ -1,6 +1,6 @@
 [style]
 based_on_style = pep8
 ALLOW_MULTILINE_LAMBDAS = True
-COLUMN_LIMIT = 80
+COLUMN_LIMIT = 50
 SPLIT_COMPLEX_COMPREHENSION = False
 COALESCE_BRACKETS = True
diff --git a/test/test_osum_echat.py b/test/test_osum_echat.py
@@ -8,22 +8,27 @@
 import sys
 
 sys.path.insert(0, '../')
-import west.models.osum_echat.patch4generate  # make patch for generate
+from west.models.osum_echat.patch4generate import do_patch
 
+do_patch()
 
-def get_feat_from_wav_path(input_wav_path, device: torch.device = torch.device('cuda')):
+
+def get_feat_from_wav_path(input_wav_path,
+                           device: torch.device = torch.device('cuda')):
     """..."""
     waveform, sample_rate = torchaudio.load(input_wav_path)
     if waveform.shape[0] > 1:
         waveform = torch.mean(waveform, dim=0, keepdim=True)
-    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+    resampler = torchaudio.transforms.Resample(orig_freq=sample_rate,
+                                               new_freq=16000)
     waveform = resampler(waveform)
     waveform = waveform.squeeze(0)
     sample_rate = 16000
     window = torch.hann_window(400)
     stft = torch.stft(waveform, 400, 160, window=window, return_complex=True)
-    magnitudes = stft[..., :-1].abs() ** 2
-    filters = torch.from_numpy(librosa.filters.mel(sr=sample_rate, n_fft=400, n_mels=80))
+    magnitudes = stft[..., :-1].abs()**2
+    filters = torch.from_numpy(
+        librosa.filters.mel(sr=sample_rate, n_fft=400, n_mels=80))
     mel_spec = filters @ magnitudes
     log_spec = torch.clamp(mel_spec, min=1e-10).log10()
     log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
@@ -39,19 +44,23 @@ def get_feat_from_wav_path(input_wav_path, device: torch.device = torch.device('
     from huggingface_hub import hf_hub_download
 
     # For natural language think model in west
-    ckpt_path = hf_hub_download(repo_id="ASLP-lab/OSUM-EChat", filename="language_think_west.pt")
+    ckpt_path = hf_hub_download(repo_id="ASLP-lab/OSUM-EChat",
+                                filename="language_think_west.pt")
     osum_config_path = "../examples/aishell/asr/conf/osum_echat.json"
     config_new = AutoConfig.from_pretrained(osum_config_path)
     osum_model = AutoModel.from_config(config_new)
     osum_model.eval()
     osum_model.to('cuda')
-    missing_keys, unexpected_keys = osum_model.load_state_dict(torch.load(ckpt_path, map_location="cpu"), strict=False)
+    missing_keys, unexpected_keys = osum_model.load_state_dict(torch.load(
+        ckpt_path, map_location="cpu"),
+                                                               strict=False)
     for key in missing_keys:
         print("missing tensor: {}".format(key))
     for key in unexpected_keys:
         print("unexpected tensor: {}".format(key))
     print(osum_model)
     test_wav_path = "./data/test_wave4osumechat.wav"
     fake_wav, faek_wav_lens = get_feat_from_wav_path(test_wav_path)
-    osum_output = osum_model.generate(audio_features=fake_wav, audio_features_lengths=faek_wav_lens)
+    osum_output = osum_model.generate(audio_features=fake_wav,
+                                      audio_features_lengths=faek_wav_lens)
     print(osum_output)
diff --git a/west/models/osum_echat/configuration_osum_echat.py b/west/models/osum_echat/configuration_osum_echat.py
@@ -8,13 +8,13 @@ class OSUMEChatConfig(PretrainedConfig):
     model_type = "osum_echat"
 
     def __init__(
-            self,
-            llm_model_name_or_path: str = 'Qwen/Qwen2.5-3B-Instruct',
-            no_init_llm: bool = True,
-            wenet_model_name_or_path: str = 'whisper-medium',
-            lora_config: Optional[Dict[str, Any]] = None,
-            speech_token_num: int = 4097,
-            **kwargs,
+        self,
+        llm_model_name_or_path: str = 'Qwen/Qwen2.5-3B-Instruct',
+        no_init_llm: bool = True,
+        wenet_model_name_or_path: str = 'whisper-medium',
+        lora_config: Optional[Dict[str, Any]] = None,
+        speech_token_num: int = 4097,
+        **kwargs,
     ):
         super().__init__(**kwargs)
         self.llm_model_name_or_path = llm_model_name_or_path
diff --git a/west/models/osum_echat/cumstom_stop_criteria.py b/west/models/osum_echat/cumstom_stop_criteria.py
@@ -6,6 +6,7 @@
 
 
 class ASRLogitsProcessor(LogitsProcessor):
+
     def __init__(self, text_token_num: int):
         self.text_token_num = text_token_num
 
@@ -41,7 +42,8 @@ def __init__(self, text_token_num: int, text_eos_id: int):
 
     def __call__(self, input_ids, scores):
         print(input_ids.shape)
-        assert input_ids.size(0) == 1, "ERROR: S2SSpeechLogitsProcessor only support bs=1 now"
+        assert input_ids.size(
+            0) == 1, "ERROR: S2SSpeechLogitsProcessor only support bs=1 now"
         if self.text_phase:
             scores[..., self.text_token_num:] = torch.finfo(scores.dtype).min
         else:
@@ -64,16 +66,19 @@ def __init__(self, text_eos_id: int, speech_eos_id: int):
         self.text_eos_id = text_eos_id
         self.speech_eos_id = speech_eos_id
 
-    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
+    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor,
+                 **kwargs):
         _input_ids = input_ids.flatten().view(-1)
         if torch.isin(_input_ids, self.text_eos_id).any():
-            text_eos_idx = (_input_ids == self.text_eos_id).nonzero(as_tuple=True)[0][0].item()
+            text_eos_idx = (_input_ids == self.text_eos_id).nonzero(
+                as_tuple=True)[0][0].item()
             if torch.sum(_input_ids[text_eos_idx:] == self.speech_eos_id) > 1:
                 return True
         return False
 
 
 class MaxTokenStopper(StoppingCriteria):
+
     def __init__(self, max_tokens):
         self.max_tokens = max_tokens
 
@@ -86,11 +91,12 @@ def __call__(self, input_ids, scores, **kwargs):
 
 
 class InterruptStopper(StoppingCriteria):
+
     def __init__(self):
         self.stop = False
 
     def __call__(self, input_ids, scores, **kwargs):
-        if self.stop == True:
+        if self.stop:
             # self.stop == False # reset
             return True
         else:
diff --git a/west/models/osum_echat/extractor_osum_echat.py b/west/models/osum_echat/extractor_osum_echat.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2025 Xuelong Geng(xlgeng@mail.nwpu.edu.cn)
 
-
 from west.dataset.extractor import Extractor
 
 
diff --git a/west/models/osum_echat/modeling_osum_echat.py b/west/models/osum_echat/modeling_osum_echat.py
diff --git a/west/models/osum_echat/patch4generate.py b/west/models/osum_echat/patch4generate.py

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,5 @@`
`1`	`1`	`# Copyright (c) 2025 Xuelong Geng(xlgeng@mail.nwpu.edu.cn)`
`2`	`2`
`3`		`-`
`4`	`3`	`from west.dataset.extractor import Extractor`
`5`	`4`
`6`	`5`