diff --git a/README.md b/README.md
index 121dfbe61..261ce630e 100644
--- a/README.md
+++ b/README.md
@@ -192,6 +192,7 @@ server:
 | TTS | GPT_SOVITS_V2 | 接口调用 | 免费/自定义 | 本地启动 TTS 服务,适用于个性化语音合成场景 |
 | TTS | GPT_SOVITS_V3 | 接口调用 | 免费/自定义 | 本地启动 TTS 服务,适用于个性化语音合成场景 |
 | TTS | MinimaxTTS | 接口调用 | 免费/自定义 | 本地启动 TTS 服务,适用于个性化语音合成场景 |
+| TTS | Kokoro | 本地使用 | 免费 | 小型TTS模型,使用中文微调后的版本[ZH-v1.1](https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh) |
 
 ---
 
diff --git a/main/xiaozhi-server/config.yaml b/main/xiaozhi-server/config.yaml
index 7fa8a24ae..4f57e22c6 100644
--- a/main/xiaozhi-server/config.yaml
+++ b/main/xiaozhi-server/config.yaml
@@ -74,7 +74,7 @@ selected_module:
   # 将根据配置名称对应的type调用实际的LLM适配器
   LLM: ChatGLMLLM
   # TTS将根据配置名称对应的type调用实际的TTS适配器
-  TTS: EdgeTTS
+  TTS: KokoroTTS
   # 记忆模块,默认不开启记忆;如果想使用超长记忆,推荐使用mem0ai;如果注重隐私,请使用本地的mem_local_short
   Memory: nomem
   # 意图识别模块,默认不开启。开启后,可以播放音乐、控制音量、识别退出指令
@@ -439,6 +439,12 @@ TTS:
         # Authorization: Bearer xxxx
     format: wav # 接口返回的音频格式
     output_file: tmp/
+  KokoroTTS:
+    # 本地部署的Kokoro TTS
+    type: kokoro_local
+    model_dir: models/Kokoro
+    output_file: tmp/
+    voice: zf_001
 # 模块测试配置
 module_test:
   test_sentences: # 自定义测试语句
diff --git a/main/xiaozhi-server/core/providers/tts/kokoro_local.py b/main/xiaozhi-server/core/providers/tts/kokoro_local.py
new file mode 100644
index 000000000..73207f8e0
--- /dev/null
+++ b/main/xiaozhi-server/core/providers/tts/kokoro_local.py
@@ -0,0 +1,70 @@
+import os
+import uuid
+from datetime import datetime
+
+import torch
+import soundfile as sf
+from kokoro import KModel, KPipeline
+
+from core.providers.tts.base import TTSProviderBase
+
+REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
+SAMPLE_RATE = 24000
+
+
+# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens.
+# Simple piecewise linear fn that decreases speed as len_ps increases.
+def _speed_callable(len_ps):
+    """Return a playback-speed factor for an utterance of ``len_ps`` phonemes."""
+    speed = 0.8
+    if len_ps <= 83:
+        speed = 1
+    elif len_ps < 183:
+        speed = 1 - (len_ps - 83) / 500
+    return speed * 1.3
+
+
+class TTSProvider(TTSProviderBase):
+    """Locally hosted Kokoro (ZH-v1.1) text-to-speech provider."""
+
+    def __init__(self, config, delete_audio_file):
+        super().__init__(config, delete_audio_file)
+        self.appid = config.get("appid")
+        # Pick the best available accelerator: CUDA, then Apple MPS, else CPU.
+        if torch.cuda.is_available():
+            self.device = 'cuda'
+        elif torch.backends.mps.is_available():
+            self.device = 'mps'
+        else:
+            self.device = 'cpu'
+        self.model_dir = config.get("model_dir")
+
+        # The English pipeline is used only to phonemize embedded English text.
+        self.en_pipeline = KPipeline(lang_code='a', repo_id=REPO_ID, model=False)
+        model = KModel(repo_id=REPO_ID,
+                       config=f"{self.model_dir}/config.json",
+                       model=f"{self.model_dir}/kokoro-v1_1-zh.pth",
+                       disable_complex=True).to(self.device).eval()
+        self.zh_pipeline = KPipeline(lang_code='z', repo_id=REPO_ID, model=model, en_callable=self.en_callable)
+        self.voice = f"{self.model_dir}/voices/{config.get('voice')}.pt"
+
+    def generate_filename(self, extension=".wav"):
+        """Build a unique output path: tts-<date>@<uuid><ext> under output_file."""
+        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")
+
+    async def text_to_speak(self, text, output_file):
+        """Synthesize ``text`` and write it as a WAV file to ``output_file``."""
+        try:
+            generator = self.zh_pipeline(text, voice=self.voice, speed=_speed_callable)
+            result = next(generator)
+            wav = result.audio
+            # 调整音量,乘以一个因子(例如 0.5 表示减小音量,2.0 表示增大音量)
+            # Clamp to [-1, 1] so the gain cannot push samples out of PCM range.
+            volume_factor = 1.5
+            wav = torch.clamp(wav * volume_factor, -1.0, 1.0)
+            sf.write(output_file, wav, SAMPLE_RATE)
+        except Exception as e:
+            raise Exception(f"{__name__} error: {e}") from e
+
+    def en_callable(self, text):
+        """Return the phoneme string for English ``text``."""
+        return next(self.en_pipeline(text)).phonemes
diff --git a/main/xiaozhi-server/models/Kokoro/config.json b/main/xiaozhi-server/models/Kokoro/config.json
new file mode 100644
index 000000000..ca3cdb357
--- /dev/null
+++ b/main/xiaozhi-server/models/Kokoro/config.json
@@ -0,0 +1,207 @@
+{
+  "istftnet": {
+    "upsample_kernel_sizes": [20, 12],
+    "upsample_rates": [10, 6],
+    "gen_istft_hop_size": 5,
+    "gen_istft_n_fft": 20,
+    "resblock_dilation_sizes": [
+      [1, 3, 5],
+      [1, 3, 5],
+      [1, 3, 5]
+    ],
+    "resblock_kernel_sizes": [3, 7, 11],
+    "upsample_initial_channel": 512
+  },
+ "dim_in": 64, + "dropout": 0.2, + "hidden_dim": 512, + "max_conv_dim": 512, + "max_dur": 50, + "multispeaker": true, + "n_layer": 3, + "n_mels": 80, + "n_token": 178, + "style_dim": 128, + "text_encoder_kernel_size": 5, + "plbert": { + "hidden_size": 768, + "num_attention_heads": 12, + "intermediate_size": 2048, + "max_position_embeddings": 512, + "num_hidden_layers": 12, + "dropout": 0.1 + }, + "vocab": { + ";": 1, + ":": 2, + ",": 3, + ".": 4, + "!": 5, + "?": 6, + "/": 7, + "—": 9, + "…": 10, + "\"": 11, + "(": 12, + ")": 13, + "“": 14, + "”": 15, + " ": 16, + "\u0303": 17, + "ʣ": 18, + "ʥ": 19, + "ʦ": 20, + "ʨ": 21, + "ᵝ": 22, + "ㄓ": 23, + "A": 24, + "I": 25, + "ㄅ": 30, + "O": 31, + "ㄆ": 32, + "Q": 33, + "R": 34, + "S": 35, + "T": 36, + "ㄇ": 37, + "ㄈ": 38, + "W": 39, + "ㄉ": 40, + "Y": 41, + "ᵊ": 42, + "a": 43, + "b": 44, + "c": 45, + "d": 46, + "e": 47, + "f": 48, + "ㄊ": 49, + "h": 50, + "i": 51, + "j": 52, + "k": 53, + "l": 54, + "m": 55, + "n": 56, + "o": 57, + "p": 58, + "q": 59, + "r": 60, + "s": 61, + "t": 62, + "u": 63, + "v": 64, + "w": 65, + "x": 66, + "y": 67, + "z": 68, + "ɑ": 69, + "ɐ": 70, + "ɒ": 71, + "æ": 72, + "ㄋ": 73, + "ㄌ": 74, + "β": 75, + "ɔ": 76, + "ɕ": 77, + "ç": 78, + "ㄍ": 79, + "ɖ": 80, + "ð": 81, + "ʤ": 82, + "ə": 83, + "ㄎ": 84, + "ㄦ": 85, + "ɛ": 86, + "ɜ": 87, + "ㄏ": 88, + "ㄐ": 89, + "ɟ": 90, + "ㄑ": 91, + "ɡ": 92, + "ㄒ": 93, + "ㄔ": 94, + "ㄕ": 95, + "ㄗ": 96, + "ㄘ": 97, + "ㄙ": 98, + "月": 99, + "ㄚ": 100, + "ɨ": 101, + "ɪ": 102, + "ʝ": 103, + "ㄛ": 104, + "ㄝ": 105, + "ㄞ": 106, + "ㄟ": 107, + "ㄠ": 108, + "ㄡ": 109, + "ɯ": 110, + "ɰ": 111, + "ŋ": 112, + "ɳ": 113, + "ɲ": 114, + "ɴ": 115, + "ø": 116, + "ㄢ": 117, + "ɸ": 118, + "θ": 119, + "œ": 120, + "ㄣ": 121, + "ㄤ": 122, + "ɹ": 123, + "ㄥ": 124, + "ɾ": 125, + "ㄖ": 126, + "ㄧ": 127, + "ʁ": 128, + "ɽ": 129, + "ʂ": 130, + "ʃ": 131, + "ʈ": 132, + "ʧ": 133, + "ㄨ": 134, + "ʊ": 135, + "ʋ": 136, + "ㄩ": 137, + "ʌ": 138, + "ɣ": 139, + "ㄜ": 140, + "ㄭ": 141, + "χ": 142, + "ʎ": 143, + "十": 144, + "压": 145, + 
+    "言": 146,
+    "ʒ": 147,
+    "ʔ": 148,
+    "阳": 149,
+    "要": 150,
+    "阴": 151,
+    "应": 152,
+    "用": 153,
+    "又": 154,
+    "中": 155,
+    "ˈ": 156,
+    "ˌ": 157,
+    "ː": 158,
+    "穵": 159,
+    "外": 160,
+    "万": 161,
+    "ʰ": 162,
+    "王": 163,
+    "ʲ": 164,
+    "为": 165,
+    "文": 166,
+    "瓮": 167,
+    "我": 168,
+    "3": 169,
+    "5": 170,
+    "1": 171,
+    "2": 172,
+    "4": 173,
+    "元": 175,
+    "云": 176,
+    "ᵻ": 177
+  }
+}
\ No newline at end of file
diff --git a/main/xiaozhi-server/models/Kokoro/configuration.json b/main/xiaozhi-server/models/Kokoro/configuration.json
new file mode 100644
index 000000000..591fb74e8
--- /dev/null
+++ b/main/xiaozhi-server/models/Kokoro/configuration.json
@@ -0,0 +1 @@
+{"framework": "pytorch", "task": "text-to-speech", "allow_remote": true}
\ No newline at end of file
diff --git a/main/xiaozhi-server/requirements.txt b/main/xiaozhi-server/requirements.txt
index bcd6b4ee4..fc0450237 100755
--- a/main/xiaozhi-server/requirements.txt
+++ b/main/xiaozhi-server/requirements.txt
@@ -19,4 +19,6 @@ loguru==0.7.3
 requests==2.32.3
 cozepy==0.12.0
 mem0ai==0.1.62
+kokoro[misaki-zh]==0.8.4
+soundfile==0.13.1
 bs4==0.0.2
\ No newline at end of file