[touchtts] support touchtts training from config

chwma0 · chwma0 · commit 7a63d608b225 · 2025-11-14T03:36:00.000Z
diff --git a/examples/libritts/tts/conf/touch_tts_config.json b/examples/libritts/tts/conf/touch_tts_config.json
@@ -1,5 +1,6 @@
 {
   "llm_model_name_or_path": "Qwen/Qwen2.5-0.5B-Audio",
+  "llm_model_tokenizer_dir": "Qwen/Qwen2.5-0.5B-Audio",
   "model_type": "touch_tts",
   "transformers_version": "4.52.3",
   "num_speech_tokens": 4096,
diff --git a/west/models/touch_tts/configuration_touch_tts.py b/west/models/touch_tts/configuration_touch_tts.py
@@ -9,6 +9,7 @@ class TouchTTSConfig(PretrainedConfig):
     def __init__(
         self,
         llm_model_name_or_path: str = 'Qwen/Qwen2-7B',
+        llm_model_tokenizer_dir: str = 'Qwen/Qwen2-7B',
         s3tokenizer_model_name_or_path: str = '',
         num_speech_tokens: int = 4096,
         hidden_size: int = 0,
@@ -23,6 +24,7 @@ def __init__(
         self.hidden_size = hidden_size
         self.max_speech_duration = max_speech_duration
         self.min_speech_duration = min_speech_duration
+        self.llm_model_tokenizer_dir = llm_model_tokenizer_dir
 
 
 __all__ = ["TouchTTSConfig"]
diff --git a/west/models/touch_tts/modeling_touch_tts.py b/west/models/touch_tts/modeling_touch_tts.py
@@ -20,12 +20,7 @@ class TouchTTS(PreTrainedModel, GenerationMixin):
     def __init__(self, config: TouchTTSConfig):
         super().__init__(config)
         llm_config = AutoConfig.from_pretrained(config.llm_model_name_or_path)
-        self.llm = AutoModelForCausalLM.from_pretrained(
-            config.llm_model_name_or_path,
-            config=llm_config,
-            torch_dtype='auto',
-            attn_implementation="flash_attention_2",  # or "flex_attention"
-        )
+        self.llm = AutoModelForCausalLM.from_config(config=llm_config)
         config.hidden_size = llm_config.hidden_size  # for deepseed training
         self.speech_tokenizer = s3tokenizer.load_model(
             'speech_tokenizer_v1_25hz', config.s3tokenizer_model_name_or_path)
@@ -126,6 +121,10 @@ def generate(
 
     def init_tokenizer(self):
         tokenizer = AutoTokenizer.from_pretrained(
-            self.config.llm_model_name_or_path)
-        tokenizer.bos_token = "<|im_start|>"
+            self.config.llm_model_tokenizer_dir, trust_remote_code=True)
+        if 'Qwen' in self.config.llm_model_tokenizer_dir:
+            tokenizer.bos_token = tokenizer.eos_token
+        # Set pad_token if not already set
+        if tokenizer.pad_token is None:
+            tokenizer.pad_token = tokenizer.eos_token
         return tokenizer

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,6 @@`
`1`	`1`	`{`
`2`	`2`	`"llm_model_name_or_path": "Qwen/Qwen2.5-0.5B-Audio",`
	`3`	`+ "llm_model_tokenizer_dir": "Qwen/Qwen2.5-0.5B-Audio",`
`3`	`4`	`"model_type": "touch_tts",`
`4`	`5`	`"transformers_version": "4.52.3",`
`5`	`6`	`"num_speech_tokens": 4096,`