Add a local TTS service implemented with the Kokoro model #385

Open · wants to merge 3 commits into main
1 change: 1 addition & 0 deletions README.md
@@ -192,6 +192,7 @@ server:
| TTS | GPT_SOVITS_V2 | API call | Free/custom | Locally hosted TTS service, suited to personalized voice synthesis scenarios |
| TTS | GPT_SOVITS_V3 | API call | Free/custom | Locally hosted TTS service, suited to personalized voice synthesis scenarios |
| TTS | MinimaxTTS | API call | Free/custom | Locally hosted TTS service, suited to personalized voice synthesis scenarios |
| TTS | Kokoro | Local | Free | Small TTS model, using the Chinese fine-tuned release [ZH-v1.1](https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh) |

---

8 changes: 7 additions & 1 deletion main/xiaozhi-server/config.yaml
@@ -74,7 +74,7 @@ selected_module:
  # The actual LLM adapter is selected by the type of the named configuration
  LLM: ChatGLMLLM
  # The actual TTS adapter is selected by the type of the named configuration
  TTS: EdgeTTS
  TTS: KokoroTTS
  # Memory module, disabled by default; for very long-term memory, mem0ai is recommended; if privacy matters, use the local mem_local_short
  Memory: nomem
  # Intent recognition module, disabled by default. When enabled, it can play music, control volume, and recognize exit commands
@@ -439,6 +439,12 @@ TTS:
    # Authorization: Bearer xxxx
    format: wav  # audio format returned by the API
    output_file: tmp/
  KokoroTTS:
    # Locally deployed Kokoro TTS
    type: kokoro_local
    model_dir: models/Kokoro
    output_file: tmp/
    voice: zf_001
# Module test configuration
module_test:
  test_sentences:  # custom test sentences
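The `type` field under each named TTS entry is what selects the adapter. As a rough illustration of how `type: kokoro_local` can resolve to the new provider module (a sketch only; `create_tts_provider` is a hypothetical name, and the project's actual factory may differ):

```python
# Illustrative sketch of type-based TTS adapter dispatch; not the project's real factory.
import importlib

def create_tts_provider(config: dict, delete_audio_file: bool = True):
    provider_type = config["type"]  # e.g. "kokoro_local"
    # Expected to resolve to core/providers/tts/kokoro_local.py, which defines TTSProvider
    module = importlib.import_module(f"core.providers.tts.{provider_type}")
    return module.TTSProvider(config, delete_audio_file)
```
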
64 changes: 64 additions & 0 deletions main/xiaozhi-server/core/providers/tts/kokoro_local.py
@@ -0,0 +1,64 @@
import os
import base64
import uuid
from datetime import datetime
from core.providers.tts.base import TTSProviderBase
import torch
from pathlib import Path
import numpy as np
import soundfile as sf
from kokoro import KModel, KPipeline


REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
SAMPLE_RATE = 24000

# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
# Simple piecewise linear fn that decreases speed as len_ps increases
def _speed_callable(len_ps):
    speed = 0.8
    if len_ps <= 83:
        speed = 1
    elif len_ps < 183:
        speed = 1 - (len_ps - 83) / 500
    return speed * 1.3


class TTSProvider(TTSProviderBase):

    def __init__(self, config, delete_audio_file):
        super().__init__(config, delete_audio_file)
        self.appid = config.get("appid")
        # Prefer CUDA, then Apple MPS, and fall back to CPU
        if torch.cuda.is_available():
            self.device = 'cuda'
        elif torch.backends.mps.is_available():
            self.device = 'mps'
        else:
            self.device = 'cpu'
        self.model_dir = config.get("model_dir")

        # English pipeline without a model, used only to phonemize embedded English text
        self.en_pipeline = KPipeline(lang_code='a', repo_id=REPO_ID, model=False)
        # Chinese Kokoro model loaded from the local model directory
        model = KModel(repo_id=REPO_ID,
                       config=f"{self.model_dir}/config.json",
                       model=f"{self.model_dir}/kokoro-v1_1-zh.pth",
                       disable_complex=True).to(self.device).eval()
        self.zh_pipeline = KPipeline(lang_code='z', repo_id=REPO_ID, model=model, en_callable=self.en_callable)
        self.voice = f"{self.model_dir}/voices/{config.get('voice')}.pt"

    def generate_filename(self, extension=".wav"):
        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")

    async def text_to_speak(self, text, output_file):
        try:
            generator = self.zh_pipeline(text, voice=self.voice, speed=_speed_callable)
            result = next(generator)
            wav = result.audio
            # Adjust the volume by a factor (e.g. 0.5 lowers the volume, 2.0 raises it)
            volume_factor = 1.5
            wav = wav * volume_factor
            sf.write(output_file, wav, SAMPLE_RATE)
        except Exception as e:
            raise Exception(f"{__name__} error: {e}")

    def en_callable(self, text):
        return next(self.en_pipeline(text)).phonemes

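For a quick standalone check of the new provider outside the server loop, a usage sketch along these lines should work; the config keys mirror the KokoroTTS entry in config.yaml, and `delete_audio_file=True` is assumed to be an acceptable default for the base class:

```python
# Standalone smoke test for kokoro_local (paths and voice name follow the
# sample config above; adjust them for your local setup).
import asyncio
from core.providers.tts.kokoro_local import TTSProvider

config = {
    "type": "kokoro_local",
    "model_dir": "models/Kokoro",
    "output_file": "tmp/",
    "voice": "zf_001",
}

provider = TTSProvider(config, delete_audio_file=True)
out_path = provider.generate_filename()  # e.g. tmp/tts-<date>@<uuid>.wav
asyncio.run(provider.text_to_speak("你好,这是一段本地合成的测试语音。", out_path))
print(f"audio written to {out_path}")
```
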
207 changes: 207 additions & 0 deletions main/xiaozhi-server/models/Kokoro/config.json
@@ -0,0 +1,207 @@
{
"istftnet": {
"upsample_kernel_sizes": [20, 12],
"upsample_rates": [10, 6],
"gen_istft_hop_size": 5,
"gen_istft_n_fft": 20,
"resblock_dilation_sizes": [
[1, 3, 5],
[1, 3, 5],
[1, 3, 5]
],
"resblock_kernel_sizes": [3, 7, 11],
"upsample_initial_channel": 512
},
"dim_in": 64,
"dropout": 0.2,
"hidden_dim": 512,
"max_conv_dim": 512,
"max_dur": 50,
"multispeaker": true,
"n_layer": 3,
"n_mels": 80,
"n_token": 178,
"style_dim": 128,
"text_encoder_kernel_size": 5,
"plbert": {
"hidden_size": 768,
"num_attention_heads": 12,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_hidden_layers": 12,
"dropout": 0.1
},
"vocab": {
";": 1,
":": 2,
",": 3,
".": 4,
"!": 5,
"?": 6,
"/": 7,
"—": 9,
"…": 10,
"\"": 11,
"(": 12,
")": 13,
"“": 14,
"”": 15,
" ": 16,
"\u0303": 17,
"ʣ": 18,
"ʥ": 19,
"ʦ": 20,
"ʨ": 21,
"ᵝ": 22,
"ㄓ": 23,
"A": 24,
"I": 25,
"ㄅ": 30,
"O": 31,
"ㄆ": 32,
"Q": 33,
"R": 34,
"S": 35,
"T": 36,
"ㄇ": 37,
"ㄈ": 38,
"W": 39,
"ㄉ": 40,
"Y": 41,
"ᵊ": 42,
"a": 43,
"b": 44,
"c": 45,
"d": 46,
"e": 47,
"f": 48,
"ㄊ": 49,
"h": 50,
"i": 51,
"j": 52,
"k": 53,
"l": 54,
"m": 55,
"n": 56,
"o": 57,
"p": 58,
"q": 59,
"r": 60,
"s": 61,
"t": 62,
"u": 63,
"v": 64,
"w": 65,
"x": 66,
"y": 67,
"z": 68,
"ɑ": 69,
"ɐ": 70,
"ɒ": 71,
"æ": 72,
"ㄋ": 73,
"ㄌ": 74,
"β": 75,
"ɔ": 76,
"ɕ": 77,
"ç": 78,
"ㄍ": 79,
"ɖ": 80,
"ð": 81,
"ʤ": 82,
"ə": 83,
"ㄎ": 84,
"ㄦ": 85,
"ɛ": 86,
"ɜ": 87,
"ㄏ": 88,
"ㄐ": 89,
"ɟ": 90,
"ㄑ": 91,
"ɡ": 92,
"ㄒ": 93,
"ㄔ": 94,
"ㄕ": 95,
"ㄗ": 96,
"ㄘ": 97,
"ㄙ": 98,
"月": 99,
"ㄚ": 100,
"ɨ": 101,
"ɪ": 102,
"ʝ": 103,
"ㄛ": 104,
"ㄝ": 105,
"ㄞ": 106,
"ㄟ": 107,
"ㄠ": 108,
"ㄡ": 109,
"ɯ": 110,
"ɰ": 111,
"ŋ": 112,
"ɳ": 113,
"ɲ": 114,
"ɴ": 115,
"ø": 116,
"ㄢ": 117,
"ɸ": 118,
"θ": 119,
"œ": 120,
"ㄣ": 121,
"ㄤ": 122,
"ɹ": 123,
"ㄥ": 124,
"ɾ": 125,
"ㄖ": 126,
"ㄧ": 127,
"ʁ": 128,
"ɽ": 129,
"ʂ": 130,
"ʃ": 131,
"ʈ": 132,
"ʧ": 133,
"ㄨ": 134,
"ʊ": 135,
"ʋ": 136,
"ㄩ": 137,
"ʌ": 138,
"ɣ": 139,
"ㄜ": 140,
"ㄭ": 141,
"χ": 142,
"ʎ": 143,
"十": 144,
"压": 145,
"言": 146,
"ʒ": 147,
"ʔ": 148,
"阳": 149,
"要": 150,
"阴": 151,
"应": 152,
"用": 153,
"又": 154,
"中": 155,
"ˈ": 156,
"ˌ": 157,
"ː": 158,
"穵": 159,
"外": 160,
"万": 161,
"ʰ": 162,
"王": 163,
"ʲ": 164,
"为": 165,
"文": 166,
"瓮": 167,
"我": 168,
"3": 169,
"5": 170,
"1": 171,
"2": 172,
"4": 173,
"元": 175,
"云": 176,
"ᵻ": 177
}
}
1 change: 1 addition & 0 deletions main/xiaozhi-server/models/Kokoro/configuration.json
@@ -0,0 +1 @@
{"framework": "pytorch", "task": "text-to-speech", "allow_remote": true}
3 changes: 3 additions & 0 deletions main/xiaozhi-server/requirements.txt
@@ -19,4 +19,7 @@ loguru==0.7.3
requests==2.32.3
cozepy==0.12.0
mem0ai==0.1.62
kokoro[misaki-zh]==0.8.4
soundfile==0.13.1
bs4==0.0.2
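
Because the provider loads the model, config, and voice from local files rather than downloading them at runtime, a small pre-flight check of `model_dir` can catch missing files before startup; this helper is optional and not part of the PR:

```python
# Optional pre-flight check that the local Kokoro files referenced by the
# provider are present; the file names follow kokoro_local.py above.
from pathlib import Path

def missing_kokoro_files(model_dir: str = "models/Kokoro", voice: str = "zf_001") -> list[str]:
    required = [
        Path(model_dir) / "config.json",
        Path(model_dir) / "kokoro-v1_1-zh.pth",
        Path(model_dir) / "voices" / f"{voice}.pt",
    ]
    return [str(p) for p in required if not p.exists()]

if __name__ == "__main__":
    missing = missing_kokoro_files()
    print("Missing Kokoro files:", missing or "none")
```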