Skip to content

Commit 260155a

Browse files
authored
[NeuralChat] Multi-language TTS support (intel#1139)
1 parent 8147541 commit 260155a

36 files changed

+5457
-164
lines changed

intel_extension_for_transformers/neural_chat/chatbot.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,18 @@
2626
from .utils.error_utils import set_latest_error, get_latest_error, clear_latest_error
2727
from intel_extension_for_transformers.utils.logger import logging
2828
import importlib
29+
import sys
2930

3031
def check_tts_dependency():
3132
try:
32-
importlib.import_module('paddlespeech')
33-
importlib.import_module('paddle')
34-
importlib.import_module('soundfile')
35-
importlib.import_module('pydub')
36-
importlib.import_module('speechbrain')
37-
importlib.import_module('librosa')
33+
for module in ['soundfile', 'pydub', 'speechbrain', 'librosa', 'zhconv', 'urllib3', 'langid',
34+
'vector_quantize_pytorch', 'cn2an', 'pypinyin', 'jaconv', 'webrtcvad', 'g2p_en', 'inflect',
35+
'jieba']:
36+
importlib.import_module(module)
37+
if sys.platform == 'linux':
38+
importlib.import_module('pyopenjtalk')
39+
else:
40+
importlib.import_module('openjtalk')
3841
return True
3942
except ImportError:
4043
return False
@@ -178,7 +181,7 @@ def build_chatbot(config: PipelineConfig=None):
178181
for plugin_name, plugin_value in config.plugins.items():
179182
enable_plugin = plugin_value.get('enable', False)
180183
if enable_plugin:
181-
if plugin_name == "tts" or plugin_name == "tts_chinese" or plugin_name == "asr":
184+
if plugin_name == "tts" or plugin_name == "tts_multilang" or plugin_name == "asr":
182185
if not check_tts_dependency():
183186
raise ImportError(
184187
f"Unable to initialize 'tts' plugin due to missing dependency packages.\n" \
@@ -229,9 +232,9 @@ def build_chatbot(config: PipelineConfig=None):
229232
if plugin_name == "tts":
230233
from .pipeline.plugins.audio.tts import TextToSpeech
231234
plugins[plugin_name]['class'] = TextToSpeech
232-
elif plugin_name == "tts_chinese":
233-
from .pipeline.plugins.audio.tts_chinese import ChineseTextToSpeech
234-
plugins[plugin_name]['class'] = ChineseTextToSpeech
235+
elif plugin_name == "tts_multilang":
236+
from .pipeline.plugins.audio.tts_multilang import MultilangTextToSpeech
237+
plugins[plugin_name]['class'] = MultilangTextToSpeech
235238
elif plugin_name == "asr":
236239
from .pipeline.plugins.audio.asr import AudioSpeechRecognition
237240
plugins[plugin_name]['class'] = AudioSpeechRecognition

intel_extension_for_transformers/neural_chat/examples/deployment/plugin/audio/README.md

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,27 @@ pip install -r ../../../requirements.txt
4747

4848
You can customize the configuration file 'audio_service.yaml' to match your environment setup. Here's a table to help you understand the configurable options:
4949

50-
| Item | Value |
51-
| --------------------------------- | ---------------------------------------|
52-
| host | 127.0.0.1 |
53-
| port | 7777 |
54-
| device | "auto" |
55-
| asr.enable | true |
56-
| asr.args.device | "cpu" |
57-
| asr.args.model_name_or_path | "openai/whisper-small" |
58-
| asr.args.bf16 | false |
59-
| tts.enable | true |
60-
| tts.args.device | "cpu" |
61-
| tts.args.voice | "default" |
62-
| tts.args.stream_mode | false |
63-
| tts.args.output_audio_path | "./output_audio.wav" |
64-
| tts.args.speedup | 1.0 |
65-
| tasks_list | ['plugin_audio'] |
50+
| Item | Value |
51+
| ---------------------------------------| ---------------------------------------|
52+
| host | 127.0.0.1 |
53+
| port | 7777 |
54+
| device | "auto" |
55+
| asr.enable | true |
56+
| asr.args.device | "cpu" |
57+
| asr.args.model_name_or_path | "openai/whisper-small" |
58+
| asr.args.bf16 | false |
59+
| tts.enable | true |
60+
| tts.args.device | "cpu" |
61+
| tts.args.voice | "default" |
62+
| tts.args.stream_mode | false |
63+
| tts.args.output_audio_path | "./output_audio.wav" |
64+
| tts_multilang.enable | true |
65+
| tts_multilang.args.device | "cpu" |
66+
| tts_multilang.args.voice | "default" |
67+
| tts_multilang.args.output_audio_path | "./output_audio.wav" |
68+
| tts_multilang.args.precision | "bf16" |
69+
| tasks_list | ['plugin_audio'] |
70+
6671

6772

6873
# Run the audio service server
@@ -77,3 +82,4 @@ To call the started audio service, the APIs are listed as follows:
7782
1. http://127.0.0.1:7777/plugin/audio/asr , upload an audio file and return the text contents.
7883
2. http://127.0.0.1:7777/plugin/audio/tts , input text string and return the binary content of the audio.
7984
3. http://127.0.0.1:7777/plugin/audio/create_embedding, upload an audio file and create an embedding of your voice.
85+
4. http://127.0.0.1:7777/plugin/audio/tts_multilang , input text string and return the binary content of the audio.

intel_extension_for_transformers/neural_chat/examples/deployment/plugin/audio/audio_service.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,10 @@ tts:
4444
stream_mode: false
4545
output_audio_path: "./output_audio.wav"
4646

47+
tts_multilang:
48+
enable: true
49+
args:
50+
device: "cpu"
51+
precision: "bf16"
52+
4753
tasks_list: ['plugin_audio']

intel_extension_for_transformers/neural_chat/models/base_model.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -421,8 +421,8 @@ def face_animate(self, image_path, audio_path=None, text=None, voice=None) -> st
421421
plugin_name = "tts"
422422
if is_plugin_enabled("tts"):
423423
plugin_name = "tts"
424-
elif is_plugin_enabled("tts_chinese"):
425-
plugin_name = "tts_chinese"
424+
elif is_plugin_enabled("tts_multilang"):
425+
plugin_name = "tts_multilang"
426426
else:
427427
raise Exception("Please specify the TTS plugin!")
428428
plugin_instance = get_plugin_instance(plugin_name)
@@ -495,12 +495,10 @@ def register_plugin_instance(self, plugin_name, instance):
495495
"""
496496
if plugin_name == "tts":
497497
self.tts = instance
498-
if plugin_name == "tts_chinese":
499-
self.tts_chinese = instance
498+
if plugin_name == "tts_multilang":
499+
self.tts_multilang = instance
500500
if plugin_name == "asr":
501501
self.asr = instance
502-
if plugin_name == "asr_chinese":
503-
self.asr_chinese = instance
504502
if plugin_name == "retrieval":
505503
self.retrieval = instance
506504
if plugin_name == "cache":

intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/README.md

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ The Audio Processing and Text-to-Speech (TTS) Plugin is a software component des
44

55
- **Text-to-Speech (TTS) Conversion**: The TTS plugin can convert written text into natural-sounding speech by synthesizing human-like voices. Users can customize the voice, tone, and speed of the generated speech to suit their specific requirements.
66

7-
- **Speech Recognition**: The ASR plugin support speech recognition, allowing it to transcribe spoken words into text. This can be used for applications like voice commands, transcription services, and voice-controlled interfaces. It supports both English and Chinese.
7+
- **Audio Speech Recognition (ASR)**: The ASR plugin supports speech recognition, allowing it to transcribe spoken words into text. This can be used for applications like voice commands, transcription services, and voice-controlled interfaces. It supports both English and Chinese.
88

9-
- **Multi-Language Support**: The plugin typically supports multiple languages and accents, making it versatile for global applications and catering to diverse user bases. It supports both English and Chinese now.
9+
- **Multi-Language Support**: The plugins typically support multiple languages and accents, making it versatile for global applications and catering to diverse user bases. The ASR plugin supports tens of languages that the Whisper model supports. The TTS plugin supports English, Chinese and Japanese currently.
1010

1111
- **Integration**: Developers can easily integrate this plugin into their applications or systems using APIs.
1212

@@ -22,24 +22,18 @@ sudo dpkg -i libssl1.1_1.1.1f-1ubuntu2.19_amd64.deb
2222

2323
For other operating systems such as CentOS, you will need to make slight adjustments.
2424

25-
# Multilingual Automatic Speech Recognition (ASR)
25+
# Multi Language Automatic Speech Recognition (ASR)
2626

27-
## Dependencies Installation
28-
29-
To use the ASR module, you need to install the necessary dependencies. You can do this by running the following command:
30-
31-
```bash
32-
pip install transformers datasets pydub
33-
```
27+
We support multi-language Automatic Speech Recognition using Whisper.
3428

3529
## Usage
3630

37-
The AudioSpeechRecognition class provides functionality for converting English/Multiligual audio to text. Here's how to use it:
31+
The AudioSpeechRecognition class provides functionality for converting multi-language audio to text. Here's how to use it:
3832

3933
```python
4034
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition
4135
# pass the parameter language="auto" to let the asr model automatically detect language
42-
# otherwise, you can pass an arbitrary language to the model (e.g. en/zh/de/fr)
36+
# otherwise, you can pass an arbitrary language to the model (e.g. en/zh/de/fr...)
4337
asr = AudioSpeechRecognition("openai/whisper-small", language="auto", device=self.device)
4438
audio_path = "~/audio.wav" # Replace with the path to your English audio file (supports MP3 and WAV)
4539
result = asr.audio2text(audio_path)
@@ -49,6 +43,7 @@ print("ASR Result:", result)
4943

5044
# English Text-to-Speech (TTS)
5145

46+
We support English-only TTS based on [SpeechT5](https://arxiv.org/pdf/2110.07205.pdf), with checkpoints downloaded directly from [HuggingFace](https://huggingface.co/microsoft/speecht5_tts). It is a two-stage TTS model composed of an acoustic model and a vocoder, and it uses a speaker embedding to distinguish between different voices. In our early experiments and development, this model with the pretrained weights produced relatively good English-only audio results and could perform voice cloning with only a few audio samples from new speakers.
5247
## Dependencies Installation
5348

5449
To use the English TTS module, you need to install the required dependencies. Run the following command:
@@ -70,33 +65,26 @@ voice = "default" # You can choose between "default," "pat," or a custom voice
7065
tts.text2speech(text_to_speak, output_audio_path, voice)
7166
```
7267

73-
# Chinese Text-to-Speech (TTS)
68+
# Multi Language Text-to-Speech (TTS)
7469

75-
## Dependencies Installation
76-
77-
To use the Chinese TTS module, you need to install the required dependencies. Run the following command:
70+
We support multi-language, multi-speaker text-to-speech functionality (Chinese, English, Japanese) on top of the project [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2), with [IPEX](https://github.com/intel/intel-extension-for-pytorch) BFloat16 inference optimization on Xeon CPU. We finetune our [checkpoints](https://huggingface.co/spycsh/bert-vits-thchs-6-8000) with partial data (6 speakers) from the audio dataset [THCHS-30](https://www.openslr.org/18/). It has a backbone of [VITS](https://arxiv.org/pdf/2106.06103.pdf), which is itself an end-to-end TTS model. Together with BERT to convert the text embedding, VITS has been proven to combine more complex latent text features with audio to obtain high-quality TTS results in multiple speakers' voices.
7871

79-
```bash
80-
pip install paddlespeech paddlepaddle
81-
```
8272

8373
## Usage
8474

85-
The ChineseTextToSpeech class within your module provides functionality for TTS. Here's how to use it:
75+
The `MultilangTextToSpeech` class within your module provides functionality for TTS. Here's how to use it:
8676

8777
```python
88-
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts_chinese import ChineseTextToSpeech
78+
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts_multilang import MultilangTextToSpeech
8979
# Initialize the TTS module
90-
tts = ChineseTextToSpeech()
80+
tts = MultilangTextToSpeech()
9181
# Define the text you want to convert to speech
92-
text_to_speak = "你好,这是一个示例文本。" # Replace with your Chinese text
82+
text_to_speak = "欢迎来到英特尔,welcome to Intel。こんにちは!" # Replace with your multi-language text
9383
# Specify the output audio path
9484
output_audio_path = "./output.wav" # Replace with your desired output audio path
9585
# Perform text-to-speech conversion
96-
tts.text2speech(text_to_speak)
86+
tts.text2speech(text_to_speak, output_audio_path)
9787

98-
# If you want to stream the generation of audio from a text generator (e.g., a language model),
99-
# you can use the following method:
100-
# audio_generator = your_text_generator_function() # Replace with your text generator
101-
# tts.stream_text2speech(audio_generator)
88+
# If you want to change the speaker, change the sid
89+
# tts.text2speech(text_to_speak, output_audio_path, sid=1)
10290
```
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright (c) 2023 Intel Corporation
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright (c) 2023 Intel Corporation
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.

0 commit comments

Comments
 (0)