Merge pull request #1366 from xinnan-tech/update-fix

xinnan-tech · web-flow · commit 76ee2c536521 · 2025-05-24T09:40:28.000+08:00
合并chat和chat_with_function_calling
diff --git a/main/manager-web/src/components/HeaderBar.vue b/main/manager-web/src/components/HeaderBar.vue
@@ -51,7 +51,7 @@
               字典管理
             </el-dropdown-item>
             <el-dropdown-item @click.native="goProviderManagement">
-              供应器管理
+              字段管理
             </el-dropdown-item>
             <el-dropdown-item @click.native="goServerSideManagement">
               服务端管理
diff --git a/main/manager-web/src/views/ProviderManagement.vue b/main/manager-web/src/views/ProviderManagement.vue
@@ -3,7 +3,7 @@
     <HeaderBar />
 
     <div class="operation-bar">
-      <h2 class="page-title">供应器管理</h2>
+      <h2 class="page-title">字段管理</h2>
       <div class="right-operations">
         <el-dropdown trigger="click" @command="handleSelectModelType" @visible-change="handleDropdownVisibleChange">
           <el-button class="category-btn">
diff --git a/main/xiaozhi-server/core/connection.py b/main/xiaozhi-server/core/connection.py
@@ -14,6 +14,8 @@
 from typing import Dict, Any
 from plugins_func.loadplugins import auto_import_modules
 from config.logger import setup_logging
+from config.config_loader import get_project_dir
+from core.utils import p3
 from core.utils.dialogue import Message, Dialogue
 from core.handle.textHandle import handleTextMessage
 from core.utils.util import (
@@ -506,106 +508,20 @@ def change_system_prompt(self, prompt):
         # 更新系统prompt至上下文
         self.dialogue.update_system_message(self.prompt)
 
-    def chat(self, query):
-
-        self.dialogue.put(Message(role="user", content=query))
-
-        response_message = []
-        processed_chars = 0  # 跟踪已处理的字符位置
-        try:
-            # 使用带记忆的对话
-            memory_str = None
-            if self.memory is not None:
-                future = asyncio.run_coroutine_threadsafe(
-                    self.memory.query_memory(query), self.loop
-                )
-                memory_str = future.result()
-
-            self.logger.bind(tag=TAG).debug(f"记忆内容: {memory_str}")
-            llm_responses = self.llm.response(
-                self.session_id, self.dialogue.get_llm_dialogue_with_memory(memory_str)
-            )
-        except Exception as e:
-            self.logger.bind(tag=TAG).error(f"LLM 处理出错 {query}: {e}")
-            return None
-
-        self.llm_finish_task = False
-        text_index = 0
-        for content in llm_responses:
-            response_message.append(content)
-            if self.client_abort:
-                break
-
-            # 合并当前全部文本并处理未分割部分
-            full_text = "".join(response_message)
-            current_text = full_text[processed_chars:]  # 从未处理的位置开始
-
-            # 查找最后一个有效标点
-            punctuations = ("。", ".", "？", "?", "！", "!", "；", ";", "：")
-            last_punct_pos = -1
-            number_flag = True
-            for punct in punctuations:
-                pos = current_text.rfind(punct)
-                prev_char = current_text[pos - 1] if pos - 1 >= 0 else ""
-                # 如果.前面是数字统一判断为小数
-                if prev_char.isdigit() and punct == ".":
-                    number_flag = False
-                if pos > last_punct_pos and number_flag:
-                    last_punct_pos = pos
-
-            # 找到分割点则处理
-            if last_punct_pos != -1:
-                segment_text_raw = current_text[: last_punct_pos + 1]
-                segment_text = get_string_no_punctuation_or_emoji(segment_text_raw)
-                if segment_text:
-                    # 强制设置空字符，测试TTS出错返回语音的健壮性
-                    # if text_index % 2 == 0:
-                    #     segment_text = " "
-                    text_index += 1
-                    self.recode_first_last_text(segment_text, text_index)
-                    future = self.executor.submit(
-                        self.speak_and_play, segment_text, text_index
-                    )
-                    self.tts_queue.put((future, text_index))
-                    processed_chars += len(segment_text_raw)  # 更新已处理字符位置
-
-        # 处理最后剩余的文本
-        full_text = "".join(response_message)
-        remaining_text = full_text[processed_chars:]
-        if remaining_text:
-            segment_text = get_string_no_punctuation_or_emoji(remaining_text)
-            if segment_text:
-                text_index += 1
-                self.recode_first_last_text(segment_text, text_index)
-                future = self.executor.submit(
-                    self.speak_and_play, segment_text, text_index
-                )
-                self.tts_queue.put((future, text_index))
-
-        self.llm_finish_task = True
-        self.dialogue.put(Message(role="assistant", content="".join(response_message)))
-        self.logger.bind(tag=TAG).debug(
-            json.dumps(self.dialogue.get_llm_dialogue(), indent=4, ensure_ascii=False)
-        )
-        return True
-
-    def chat_with_function_calling(self, query, tool_call=False):
-        self.logger.bind(tag=TAG).debug(f"Chat with function calling start: {query}")
-        """Chat with function calling for intent detection using streaming"""
+    def chat(self, query, tool_call=False):
+        self.logger.bind(tag=TAG).debug(f"Chat: {query}")
 
         if not tool_call:
             self.dialogue.put(Message(role="user", content=query))
 
         # Define intent functions
         functions = None
-        if hasattr(self, "func_handler"):
+        if self.intent_type == "function_call" and hasattr(self, "func_handler"):
             functions = self.func_handler.get_functions()
         response_message = []
         processed_chars = 0  # 跟踪已处理的字符位置
 
         try:
-            start_time = time.time()
-
             # 使用带记忆的对话
             memory_str = None
             if self.memory is not None:
@@ -614,14 +530,18 @@ def chat_with_function_calling(self, query, tool_call=False):
                 )
                 memory_str = future.result()
 
-            # self.logger.bind(tag=TAG).info(f"对话记录: {self.dialogue.get_llm_dialogue_with_memory(memory_str)}")
-
-            # 使用支持functions的streaming接口
-            llm_responses = self.llm.response_with_functions(
-                self.session_id,
-                self.dialogue.get_llm_dialogue_with_memory(memory_str),
-                functions=functions,
-            )
+            if functions is not None:
+                # 使用支持functions的streaming接口
+                llm_responses = self.llm.response_with_functions(
+                    self.session_id,
+                    self.dialogue.get_llm_dialogue_with_memory(memory_str),
+                    functions=functions,
+                )
+            else:
+                llm_responses = self.llm.response(
+                    self.session_id,
+                    self.dialogue.get_llm_dialogue_with_memory(memory_str),
+                )
         except Exception as e:
             self.logger.bind(tag=TAG).error(f"LLM 处理出错 {query}: {e}")
             return None
@@ -637,27 +557,28 @@ def chat_with_function_calling(self, query, tool_call=False):
         content_arguments = ""
 
         for response in llm_responses:
-            content, tools_call = response
-
-            if "content" in response:
-                content = response["content"]
-                tools_call = None
-            if content is not None and len(content) > 0:
-                content_arguments += content
-
-            if not tool_call_flag and content_arguments.startswith("<tool_call>"):
-                # print("content_arguments", content_arguments)
-                tool_call_flag = True
-
-            if tools_call is not None:
-                tool_call_flag = True
-                if tools_call[0].id is not None:
-                    function_id = tools_call[0].id
-                if tools_call[0].function.name is not None:
-                    function_name = tools_call[0].function.name
-                if tools_call[0].function.arguments is not None:
-                    function_arguments += tools_call[0].function.arguments
-
+            if self.intent_type == "function_call":
+                content, tools_call = response
+                if "content" in response:
+                    content = response["content"]
+                    tools_call = None
+                if content is not None and len(content) > 0:
+                    content_arguments += content
+
+                if not tool_call_flag and content_arguments.startswith("<tool_call>"):
+                    # print("content_arguments", content_arguments)
+                    tool_call_flag = True
+
+                if tools_call is not None:
+                    tool_call_flag = True
+                    if tools_call[0].id is not None:
+                        function_id = tools_call[0].id
+                    if tools_call[0].function.name is not None:
+                        function_name = tools_call[0].function.name
+                    if tools_call[0].function.arguments is not None:
+                        function_arguments += tools_call[0].function.arguments
+            else:
+                content = response
             if content is not None and len(content) > 0:
                 if not tool_call_flag:
                     response_message.append(content)
@@ -696,7 +617,7 @@ def chat_with_function_calling(self, query, tool_call=False):
                             text_index += 1
                             self.recode_first_last_text(segment_text, text_index)
                             future = self.executor.submit(
-                                self.speak_and_play, segment_text, text_index
+                                self.speak_and_play, None, segment_text, text_index
                             )
                             self.tts_queue.put((future, text_index))
                             # 更新已处理字符位置
@@ -755,7 +676,7 @@ def chat_with_function_calling(self, query, tool_call=False):
                 text_index += 1
                 self.recode_first_last_text(segment_text, text_index)
                 future = self.executor.submit(
-                    self.speak_and_play, segment_text, text_index
+                    self.speak_and_play, None, segment_text, text_index
                 )
                 self.tts_queue.put((future, text_index))
 
@@ -818,7 +739,7 @@ def _handle_function_result(self, result, function_call_data, text_index):
         if result.action == Action.RESPONSE:  # 直接回复前端
             text = result.response
             self.recode_first_last_text(text, text_index)
-            future = self.executor.submit(self.speak_and_play, text, text_index)
+            future = self.executor.submit(self.speak_and_play, None, text, text_index)
             self.tts_queue.put((future, text_index))
             self.dialogue.put(Message(role="assistant", content=text))
         elif result.action == Action.REQLLM:  # 调用函数后再请求llm生成回复
@@ -853,11 +774,11 @@ def _handle_function_result(self, result, function_call_data, text_index):
                         content=text,
                     )
                 )
-                self.chat_with_function_calling(text, tool_call=True)
+                self.chat(text, tool_call=True)
         elif result.action == Action.NOTFOUND or result.action == Action.ERROR:
             text = result.result
             self.recode_first_last_text(text, text_index)
-            future = self.executor.submit(self.speak_and_play, text, text_index)
+            future = self.executor.submit(self.speak_and_play, None, text, text_index)
             self.tts_queue.put((future, text_index))
             self.dialogue.put(Message(role="assistant", content=text))
         else:
@@ -884,11 +805,7 @@ def _tts_priority_thread(self):
                     self.logger.bind(tag=TAG).debug("正在处理TTS任务...")
                     tts_timeout = int(self.config.get("tts_timeout", 10))
                     tts_file, text, _ = future.result(timeout=tts_timeout)
-                    if text is None or len(text) <= 0:
-                        self.logger.bind(tag=TAG).error(
-                            f"TTS出错：{text_index}: tts text is empty"
-                        )
-                    elif tts_file is None:
+                    if tts_file is None:
                         self.logger.bind(tag=TAG).error(
                             f"TTS出错： file is empty: {text_index}: {text}"
                         )
@@ -897,12 +814,16 @@ def _tts_priority_thread(self):
                             f"TTS生成：文件路径: {tts_file}"
                         )
                         if os.path.exists(tts_file):
-                            if self.audio_format == "pcm":
+                            if tts_file.endswith(".p3"):
+                                audio_datas, _ = p3.decode_opus_from_file(tts_file)
+                            elif self.audio_format == "pcm":
                                 audio_datas, _ = self.tts.audio_to_pcm_data(tts_file)
                             else:
                                 audio_datas, _ = self.tts.audio_to_opus_data(tts_file)
                             # 在这里上报TTS数据
-                            enqueue_tts_report(self, text, audio_datas)
+                            enqueue_tts_report(
+                                self, tts_file if text is None else text, audio_datas
+                            )
                         else:
                             self.logger.bind(tag=TAG).error(
                                 f"TTS出错：文件不存在{tts_file}"
@@ -918,6 +839,7 @@ def _tts_priority_thread(self):
                     self.tts.delete_audio_file
                     and tts_file is not None
                     and os.path.exists(tts_file)
+                    and tts_file.startswith(self.tts.output_file)
                 ):
                     os.remove(tts_file)
             except Exception as e:
@@ -984,18 +906,21 @@ def _report_worker(self):
 
         self.logger.bind(tag=TAG).info("聊天记录上报线程已退出")
 
-    def speak_and_play(self, text, text_index=0):
-        if text is None or len(text) <= 0:
-            self.logger.bind(tag=TAG).info(f"无需tts转换，query为空，{text}")
-            return None, text, text_index
-        tts_file = self.tts.to_tts(text)
+    def speak_and_play(self, file_path, content, text_index=0):
+        if file_path is not None:
+            self.logger.bind(tag=TAG).info(f"无需tts转换: 从文件播放，{file_path}")
+            return file_path, content, text_index
+        if content is None or len(content) <= 0:
+            self.logger.bind(tag=TAG).info(f"无需tts转换，query为空，{content}")
+            return None, content, text_index
+        tts_file = self.tts.to_tts(content)
         if tts_file is None:
-            self.logger.bind(tag=TAG).error(f"tts转换失败，{text}")
-            return None, text, text_index
+            self.logger.bind(tag=TAG).error(f"tts转换失败，{content}")
+            return None, content, text_index
         self.logger.bind(tag=TAG).debug(f"TTS 文件生成完毕: {tts_file}")
         if self.max_output_size > 0:
-            add_device_output(self.headers.get("device-id"), len(text))
-        return tts_file, text, text_index
+            add_device_output(self.headers.get("device-id"), len(content))
+        return tts_file, content, text_index
 
     def clearSpeakStatus(self):
         self.logger.bind(tag=TAG).debug(f"清除服务端讲话状态")
diff --git a/main/xiaozhi-server/core/handle/intentHandler.py b/main/xiaozhi-server/core/handle/intentHandler.py
@@ -109,29 +109,29 @@ def process_function_call():
                     if result.action == Action.RESPONSE:  # 直接回复前端
                         text = result.response
                         if text is not None:
-                            speak_and_play(conn, text)
+                            speak_txt(conn, text)
                     elif result.action == Action.REQLLM:  # 调用函数后再请求llm生成回复
                         text = result.result
                         conn.dialogue.put(Message(role="tool", content=text))
                         llm_result = conn.intent.replyResult(text, original_text)
                         if llm_result is None:
                             llm_result = text
-                        speak_and_play(conn, llm_result)
+                        speak_txt(conn, llm_result)
                     elif (
                         result.action == Action.NOTFOUND
                         or result.action == Action.ERROR
                     ):
                         text = result.result
                         if text is not None:
-                            speak_and_play(conn, text)
+                            speak_txt(conn, text)
                     elif function_name != "play_music":
                         # For backward compatibility with original code
                         # 获取当前最新的文本索引
                         text = result.response
                         if text is None:
                             text = result.result
                         if text is not None:
-                            speak_and_play(conn, text)
+                            speak_txt(conn, text)
 
             # 将函数执行放在线程池中
             conn.executor.submit(process_function_call)
@@ -142,12 +142,12 @@ def process_function_call():
         return False
 
 
-def speak_and_play(conn, text):
+def speak_txt(conn, text):
     text_index = (
         conn.tts_last_text_index + 1 if hasattr(conn, "tts_last_text_index") else 0
     )
     conn.recode_first_last_text(text, text_index)
-    future = conn.executor.submit(conn.speak_and_play, text, text_index)
+    future = conn.executor.submit(conn.speak_and_play, None, text, text_index)
     conn.llm_finish_task = True
     conn.tts_queue.put((future, text_index))
     conn.dialogue.put(Message(role="assistant", content=text))
diff --git a/main/xiaozhi-server/core/handle/receiveAudioHandle.py b/main/xiaozhi-server/core/handle/receiveAudioHandle.py
@@ -39,7 +39,9 @@ async def handleAudioMessage(conn, audio):
         if len(conn.asr_audio) < 15:
             conn.asr_server_receive = True
         else:
-            raw_text, _ = await conn.asr.speech_to_text(conn.asr_audio, conn.session_id)  # 确保ASR模块返回原始文本
+            raw_text, _ = await conn.asr.speech_to_text(
+                conn.asr_audio, conn.session_id
+            )  # 确保ASR模块返回原始文本
             conn.logger.bind(tag=TAG).info(f"识别文本: {raw_text}")
             text_len, _ = remove_punctuation_and_length(raw_text)
             if text_len > 0:
@@ -76,11 +78,7 @@ async def startToChat(conn, text):
 
     # 意图未被处理，继续常规聊天流程
     await send_stt_message(conn, text)
-    if conn.intent_type == "function_call":
-        # 使用支持function calling的聊天方法
-        conn.executor.submit(conn.chat_with_function_calling, text)
-    else:
-        conn.executor.submit(conn.chat, text)
+    conn.executor.submit(conn.chat, text)
 
 
 async def no_voice_close_connect(conn):
diff --git a/main/xiaozhi-server/plugins_func/functions/play_music.py b/main/xiaozhi-server/plugins_func/functions/play_music.py