update:优化音频播放方法

openrz · openrz · commit 472106390dca · 2025-05-24T09:39:10.000+08:00
diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py
@@ -14,6 +14,8 @@
 from typing import Dict, Any
 from plugins_func.loadplugins import auto_import_modules
 from config.logger import setup_logging
+from config.config_loader import get_project_dir
+from core.utils import p3
 from core.utils.dialogue import Message, Dialogue
 from core.handle.textHandle import handleTextMessage
 from core.utils.util import (
@@ -615,7 +617,7 @@ def chat(self, query, tool_call=False):
                             text_index += 1
                             self.recode_first_last_text(segment_text, text_index)
                             future = self.executor.submit(
-                                self.speak_and_play, segment_text, text_index
+                                self.speak_and_play, None, segment_text, text_index
                             )
                             self.tts_queue.put((future, text_index))
                             # 更新已处理字符位置
@@ -674,7 +676,7 @@ def chat(self, query, tool_call=False):
                 text_index += 1
                 self.recode_first_last_text(segment_text, text_index)
                 future = self.executor.submit(
-                    self.speak_and_play, segment_text, text_index
+                    self.speak_and_play, None, segment_text, text_index
                 )
                 self.tts_queue.put((future, text_index))
 
@@ -737,7 +739,7 @@ def _handle_function_result(self, result, function_call_data, text_index):
         if result.action == Action.RESPONSE:  # 直接回复前端
             text = result.response
             self.recode_first_last_text(text, text_index)
-            future = self.executor.submit(self.speak_and_play, text, text_index)
+            future = self.executor.submit(self.speak_and_play, None, text, text_index)
             self.tts_queue.put((future, text_index))
             self.dialogue.put(Message(role="assistant", content=text))
         elif result.action == Action.REQLLM:  # 调用函数后再请求llm生成回复
@@ -776,7 +778,7 @@ def _handle_function_result(self, result, function_call_data, text_index):
         elif result.action == Action.NOTFOUND or result.action == Action.ERROR:
             text = result.result
             self.recode_first_last_text(text, text_index)
-            future = self.executor.submit(self.speak_and_play, text, text_index)
+            future = self.executor.submit(self.speak_and_play, None, text, text_index)
             self.tts_queue.put((future, text_index))
             self.dialogue.put(Message(role="assistant", content=text))
         else:
@@ -803,11 +805,7 @@ def _tts_priority_thread(self):
                     self.logger.bind(tag=TAG).debug("正在处理TTS任务...")
                     tts_timeout = int(self.config.get("tts_timeout", 10))
                     tts_file, text, _ = future.result(timeout=tts_timeout)
-                    if text is None or len(text) <= 0:
-                        self.logger.bind(tag=TAG).error(
-                            f"TTS出错：{text_index}: tts text is empty"
-                        )
-                    elif tts_file is None:
+                    if tts_file is None:
                         self.logger.bind(tag=TAG).error(
                             f"TTS出错： file is empty: {text_index}: {text}"
                         )
@@ -816,12 +814,16 @@ def _tts_priority_thread(self):
                             f"TTS生成：文件路径: {tts_file}"
                         )
                         if os.path.exists(tts_file):
-                            if self.audio_format == "pcm":
+                            if tts_file.endswith(".p3"):
+                                audio_datas, _ = p3.decode_opus_from_file(tts_file)
+                            elif self.audio_format == "pcm":
                                 audio_datas, _ = self.tts.audio_to_pcm_data(tts_file)
                             else:
                                 audio_datas, _ = self.tts.audio_to_opus_data(tts_file)
                             # 在这里上报TTS数据
-                            enqueue_tts_report(self, text, audio_datas)
+                            enqueue_tts_report(
+                                self, tts_file if text is None else text, audio_datas
+                            )
                         else:
                             self.logger.bind(tag=TAG).error(
                                 f"TTS出错：文件不存在{tts_file}"
@@ -837,6 +839,7 @@ def _tts_priority_thread(self):
                     self.tts.delete_audio_file
                     and tts_file is not None
                     and os.path.exists(tts_file)
+                    and tts_file.startswith(self.tts.output_file)
                 ):
                     os.remove(tts_file)
             except Exception as e:
@@ -903,18 +906,21 @@ def _report_worker(self):
 
         self.logger.bind(tag=TAG).info("聊天记录上报线程已退出")
 
-    def speak_and_play(self, text, text_index=0):
-        if text is None or len(text) <= 0:
-            self.logger.bind(tag=TAG).info(f"无需tts转换，query为空，{text}")
-            return None, text, text_index
-        tts_file = self.tts.to_tts(text)
+    def speak_and_play(self, file_path, content, text_index=0):
+        if file_path is not None:
+            self.logger.bind(tag=TAG).info(f"无需tts转换: 从文件播放，{file_path}")
+            return file_path, content, text_index
+        if content is None or len(content) <= 0:
+            self.logger.bind(tag=TAG).info(f"无需tts转换，query为空，{content}")
+            return None, content, text_index
+        tts_file = self.tts.to_tts(content)
         if tts_file is None:
-            self.logger.bind(tag=TAG).error(f"tts转换失败，{text}")
-            return None, text, text_index
+            self.logger.bind(tag=TAG).error(f"tts转换失败，{content}")
+            return None, content, text_index
         self.logger.bind(tag=TAG).debug(f"TTS 文件生成完毕: {tts_file}")
         if self.max_output_size > 0:
-            add_device_output(self.headers.get("device-id"), len(text))
-        return tts_file, text, text_index
+            add_device_output(self.headers.get("device-id"), len(content))
+        return tts_file, content, text_index
 
     def clearSpeakStatus(self):
         self.logger.bind(tag=TAG).debug(f"清除服务端讲话状态")
diff --git a/main/xiaozhi-server/core/handle/intentHandler.py b/main/xiaozhi-server/core/handle/intentHandler.py
@@ -109,29 +109,29 @@ def process_function_call():
                     if result.action == Action.RESPONSE:  # 直接回复前端
                         text = result.response
                         if text is not None:
-                            speak_and_play(conn, text)
+                            speak_txt(conn, text)
                     elif result.action == Action.REQLLM:  # 调用函数后再请求llm生成回复
                         text = result.result
                         conn.dialogue.put(Message(role="tool", content=text))
                         llm_result = conn.intent.replyResult(text, original_text)
                         if llm_result is None:
                             llm_result = text
-                        speak_and_play(conn, llm_result)
+                        speak_txt(conn, llm_result)
                     elif (
                         result.action == Action.NOTFOUND
                         or result.action == Action.ERROR
                     ):
                         text = result.result
                         if text is not None:
-                            speak_and_play(conn, text)
+                            speak_txt(conn, text)
                     elif function_name != "play_music":
                         # For backward compatibility with original code
                         # 获取当前最新的文本索引
                         text = result.response
                         if text is None:
                             text = result.result
                         if text is not None:
-                            speak_and_play(conn, text)
+                            speak_txt(conn, text)
 
             # 将函数执行放在线程池中
             conn.executor.submit(process_function_call)
@@ -142,12 +142,12 @@ def process_function_call():
         return False
 
 
-def speak_and_play(conn, text):
+def speak_txt(conn, text):
     text_index = (
         conn.tts_last_text_index + 1 if hasattr(conn, "tts_last_text_index") else 0
     )
     conn.recode_first_last_text(text, text_index)
-    future = conn.executor.submit(conn.speak_and_play, text, text_index)
+    future = conn.executor.submit(conn.speak_and_play, None, text, text_index)
     conn.llm_finish_task = True
     conn.tts_queue.put((future, text_index))
     conn.dialogue.put(Message(role="assistant", content=text))
diff --git a/main/xiaozhi-server/plugins_func/functions/play_music.py b/main/xiaozhi-server/plugins_func/functions/play_music.py
@@ -216,24 +216,16 @@ async def play_local_music(conn, specific_file=None):
         text = _get_random_play_prompt(selected_music)
         await send_stt_message(conn, text)
         conn.dialogue.put(Message(role="assistant", content=text))
-        conn.tts_first_text_index = 0
-        conn.tts_last_text_index = 0
 
-        tts_file = await asyncio.to_thread(conn.tts.to_tts, text)
-        if tts_file is not None and os.path.exists(tts_file):
-            conn.tts_last_text_index = 1
-            opus_packets, _ = conn.tts.audio_to_opus_data(tts_file)
-            conn.audio_play_queue.put((opus_packets, None, 0))
-            os.remove(tts_file)
+        conn.recode_first_last_text(text, 0)
+        future = conn.executor.submit(conn.speak_and_play, None, text, 0)
+        conn.tts_queue.put((future, 0))
 
+        conn.recode_first_last_text(text, 1)
+        future = conn.executor.submit(conn.speak_and_play, music_path, None, 1)
+        conn.tts_queue.put((future, 1))
         conn.llm_finish_task = True
 
-        if music_path.endswith(".p3"):
-            opus_packets, _ = p3.decode_opus_from_file(music_path)
-        else:
-            opus_packets, _ = conn.tts.audio_to_opus_data(music_path)
-        conn.audio_play_queue.put((opus_packets, None, conn.tts_last_text_index))
-
     except Exception as e:
         conn.logger.bind(tag=TAG).error(f"播放音乐失败: {str(e)}")
         conn.logger.bind(tag=TAG).error(f"详细错误: {traceback.format_exc()}")