Add a local TTS service implemented with the Kokoro model #385

Open · wants to merge 3 commits into main
1 change: 1 addition & 0 deletions README.md
@@ -192,6 +192,7 @@ server:
| TTS | GPT_SOVITS_V2 | API call | Free/custom | Locally hosted TTS service, suited to personalized voice synthesis scenarios |
| TTS | GPT_SOVITS_V3 | API call | Free/custom | Locally hosted TTS service, suited to personalized voice synthesis scenarios |
| TTS | MinimaxTTS | API call | Free/custom | Locally hosted TTS service, suited to personalized voice synthesis scenarios |
| TTS | Kokoro | Local | Free | Small TTS model, using the Chinese fine-tuned release [ZH-v1.1](https://huggingface.co/hexgrad/Kokoro-82M-v1.1-zh) |

---

8 changes: 7 additions & 1 deletion main/xiaozhi-server/config.yaml
@@ -74,7 +74,7 @@ selected_module:
  # The actual LLM adapter is selected by the type of the named configuration
  LLM: ChatGLMLLM
  # The actual TTS adapter is selected by the type of the named configuration
  TTS: EdgeTTS
  TTS: KokoroTTS
  # Memory module, disabled by default; for very long-term memory, mem0ai is recommended; if privacy matters, use the local mem_local_short
  Memory: nomem
  # Intent recognition module, disabled by default. When enabled, it can play music, control volume, and recognize exit commands
@@ -439,6 +439,12 @@ TTS:
    # Authorization: Bearer xxxx
    format: wav  # audio format returned by the API
    output_file: tmp/
  KokoroTTS:
    # Locally deployed Kokoro TTS
    type: kokoro_local
    model_dir: models/Kokoro
    output_file: tmp/
    voice: zf_001
# Module test configuration
module_test:
  test_sentences:  # custom test sentences
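The `type` field under each named TTS entry is what selects the adapter. As a rough illustration of how `type: kokoro_local` can resolve to the new provider module (a sketch only; `create_tts_provider` is a hypothetical name, and the project's actual factory may differ):

```python
# Illustrative sketch of type-based TTS adapter dispatch; not the project's real factory.
import importlib

def create_tts_provider(config: dict, delete_audio_file: bool = True):
    provider_type = config["type"]  # e.g. "kokoro_local"
    # Expected to resolve to core/providers/tts/kokoro_local.py, which defines TTSProvider
    module = importlib.import_module(f"core.providers.tts.{provider_type}")
    return module.TTSProvider(config, delete_audio_file)
```
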
64 changes: 64 additions & 0 deletions main/xiaozhi-server/core/providers/tts/kokoro_local.py
@@ -0,0 +1,64 @@
import os
import base64
import uuid
from datetime import datetime
from core.providers.tts.base import TTSProviderBase
import torch
from pathlib import Path
import numpy as np
import soundfile as sf
from kokoro import KModel, KPipeline


REPO_ID = 'hexgrad/Kokoro-82M-v1.1-zh'
SAMPLE_RATE = 24000

# HACK: Mitigate rushing caused by lack of training data beyond ~100 tokens
# Simple piecewise linear fn that decreases speed as len_ps increases
def _speed_callable(len_ps):
    speed = 0.8
    if len_ps <= 83:
        speed = 1
    elif len_ps < 183:
        speed = 1 - (len_ps - 83) / 500
    return speed * 1.3


class TTSProvider(TTSProviderBase):

    def __init__(self, config, delete_audio_file):
        super().__init__(config, delete_audio_file)
        self.appid = config.get("appid")
        # Prefer CUDA, then Apple MPS, and fall back to CPU
        if torch.cuda.is_available():
            self.device = 'cuda'
        elif torch.backends.mps.is_available():
            self.device = 'mps'
        else:
            self.device = 'cpu'
        self.model_dir = config.get("model_dir")

        # English pipeline without a model, used only to phonemize embedded English text
        self.en_pipeline = KPipeline(lang_code='a', repo_id=REPO_ID, model=False)
        # Chinese Kokoro model loaded from the local model directory
        model = KModel(repo_id=REPO_ID,
                       config=f"{self.model_dir}/config.json",
                       model=f"{self.model_dir}/kokoro-v1_1-zh.pth",
                       disable_complex=True).to(self.device).eval()
        self.zh_pipeline = KPipeline(lang_code='z', repo_id=REPO_ID, model=model, en_callable=self.en_callable)
        self.voice = f"{self.model_dir}/voices/{config.get('voice')}.pt"

    def generate_filename(self, extension=".wav"):
        return os.path.join(self.output_file, f"tts-{datetime.now().date()}@{uuid.uuid4().hex}{extension}")

    async def text_to_speak(self, text, output_file):
        try:
            generator = self.zh_pipeline(text, voice=self.voice, speed=_speed_callable)
            result = next(generator)
            wav = result.audio
            # Adjust the volume by a factor (e.g. 0.5 lowers the volume, 2.0 raises it)
            volume_factor = 1.5
            wav = wav * volume_factor
            sf.write(output_file, wav, SAMPLE_RATE)
        except Exception as e:
            raise Exception(f"{__name__} error: {e}")

    def en_callable(self, text):
        return next(self.en_pipeline(text)).phonemes

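For a quick standalone check of the new provider outside the server loop, a usage sketch along these lines should work; the config keys mirror the KokoroTTS entry in config.yaml, and `delete_audio_file=True` is assumed to be an acceptable default for the base class:

```python
# Standalone smoke test for kokoro_local (paths and voice name follow the
# sample config above; adjust them for your local setup).
import asyncio
from core.providers.tts.kokoro_local import TTSProvider

config = {
    "type": "kokoro_local",
    "model_dir": "models/Kokoro",
    "output_file": "tmp/",
    "voice": "zf_001",
}

provider = TTSProvider(config, delete_audio_file=True)
out_path = provider.generate_filename()  # e.g. tmp/tts-<date>@<uuid>.wav
asyncio.run(provider.text_to_speak("你好,这是一段本地合成的测试语音。", out_path))
print(f"audio written to {out_path}")
```
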
207 changes: 207 additions & 0 deletions main/xiaozhi-server/models/Kokoro/config.json
@@ -0,0 +1,207 @@
{
"istftnet": {
"upsample_kernel_sizes": [20, 12],
"upsample_rates": [10, 6],
"gen_istft_hop_size": 5,
"gen_istft_n_fft": 20,
"resblock_dilation_sizes": [
[1, 3, 5],
[1, 3, 5],
[1, 3, 5]
],
"resblock_kernel_sizes": [3, 7, 11],
"upsample_initial_channel": 512
},
"dim_in": 64,
"dropout": 0.2,
"hidden_dim": 512,
"max_conv_dim": 512,
"max_dur": 50,
"multispeaker": true,
"n_layer": 3,
"n_mels": 80,
"n_token": 178,
"style_dim": 128,
"text_encoder_kernel_size": 5,
"plbert": {
"hidden_size": 768,
"num_attention_heads": 12,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_hidden_layers": 12,
"dropout": 0.1
},
"vocab": {
";": 1,
":": 2,
",": 3,
".": 4,
"!": 5,
"?": 6,
"/": 7,
"—": 9,
"…": 10,
"\"": 11,
"(": 12,
")": 13,
"“": 14,
"”": 15,
" ": 16,
"\u0303": 17,
"ʣ": 18,
"ʥ": 19,
"ʦ": 20,
"ʨ": 21,
"ᵝ": 22,
"ㄓ": 23,
"A": 24,
"I": 25,
"ㄅ": 30,
"O": 31,
"ㄆ": 32,
"Q": 33,
"R": 34,
"S": 35,
"T": 36,
"ㄇ": 37,
"ㄈ": 38,
"W": 39,
"ㄉ": 40,
"Y": 41,
"ᵊ": 42,
"a": 43,
"b": 44,
"c": 45,
"d": 46,
"e": 47,
"f": 48,
"ㄊ": 49,
"h": 50,
"i": 51,
"j": 52,
"k": 53,
"l": 54,
"m": 55,
"n": 56,
"o": 57,
"p": 58,
"q": 59,
"r": 60,
"s": 61,
"t": 62,
"u": 63,
"v": 64,
"w": 65,
"x": 66,
"y": 67,
"z": 68,
"ɑ": 69,
"ɐ": 70,
"ɒ": 71,
"æ": 72,
"ㄋ": 73,
"ㄌ": 74,
"β": 75,
"ɔ": 76,
"ɕ": 77,
"ç": 78,
"ㄍ": 79,
"ɖ": 80,
"ð": 81,
"ʤ": 82,
"ə": 83,
"ㄎ": 84,
"ㄦ": 85,
"ɛ": 86,
"ɜ": 87,
"ㄏ": 88,
"ㄐ": 89,
"ɟ": 90,
"ㄑ": 91,
"ɡ": 92,
"ㄒ": 93,
"ㄔ": 94,
"ㄕ": 95,
"ㄗ": 96,
"ㄘ": 97,
"ㄙ": 98,
"月": 99,
"ㄚ": 100,
"ɨ": 101,
"ɪ": 102,
"ʝ": 103,
"ㄛ": 104,
"ㄝ": 105,
"ㄞ": 106,
"ㄟ": 107,
"ㄠ": 108,
"ㄡ": 109,
"ɯ": 110,
"ɰ": 111,
"ŋ": 112,
"ɳ": 113,
"ɲ": 114,
"ɴ": 115,
"ø": 116,
"ㄢ": 117,
"ɸ": 118,
"θ": 119,
"œ": 120,
"ㄣ": 121,
"ㄤ": 122,
"ɹ": 123,
"ㄥ": 124,
"ɾ": 125,
"ㄖ": 126,
"ㄧ": 127,
"ʁ": 128,
"ɽ": 129,
"ʂ": 130,
"ʃ": 131,
"ʈ": 132,
"ʧ": 133,
"ㄨ": 134,
"ʊ": 135,
"ʋ": 136,
"ㄩ": 137,
"ʌ": 138,
"ɣ": 139,
"ㄜ": 140,
"ㄭ": 141,
"χ": 142,
"ʎ": 143,
"十": 144,
"压": 145,
"言": 146,
"ʒ": 147,
"ʔ": 148,
"阳": 149,
"要": 150,
"阴": 151,
"应": 152,
"用": 153,
"又": 154,
"中": 155,
"ˈ": 156,
"ˌ": 157,
"ː": 158,
"穵": 159,
"外": 160,
"万": 161,
"ʰ": 162,
"王": 163,
"ʲ": 164,
"为": 165,
"文": 166,
"瓮": 167,
"我": 168,
"3": 169,
"5": 170,
"1": 171,
"2": 172,
"4": 173,
"元": 175,
"云": 176,
"ᵻ": 177
}
}
1 change: 1 addition & 0 deletions main/xiaozhi-server/models/Kokoro/configuration.json
@@ -0,0 +1 @@
{"framework": "pytorch", "task": "text-to-speech", "allow_remote": true}
3 changes: 3 additions & 0 deletions main/xiaozhi-server/requirements.txt
@@ -19,4 +19,7 @@ loguru==0.7.3
requests==2.32.3
cozepy==0.12.0
mem0ai==0.1.62
kokoro[misaki-zh]==0.8.4
soundfile==0.13.1
bs4==0.0.2
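
Because the provider loads the model, config, and voice from local files rather than downloading them at runtime, a small pre-flight check of `model_dir` can catch missing files before startup; this helper is optional and not part of the PR:

```python
# Optional pre-flight check that the local Kokoro files referenced by the
# provider are present; the file names follow kokoro_local.py above.
from pathlib import Path

def missing_kokoro_files(model_dir: str = "models/Kokoro", voice: str = "zf_001") -> list[str]:
    required = [
        Path(model_dir) / "config.json",
        Path(model_dir) / "kokoro-v1_1-zh.pth",
        Path(model_dir) / "voices" / f"{voice}.pt",
    ]
    return [str(p) for p in required if not p.exists()]

if __name__ == "__main__":
    missing = missing_kokoro_files()
    print("Missing Kokoro files:", missing or "none")
```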