Skip to content

Commit 260155a

Browse files
authored
[NeuralChat] Multi-language TTS support (intel#1139)
1 parent 8147541 commit 260155a

36 files changed

+5457
-164
lines changed

intel_extension_for_transformers/neural_chat/chatbot.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,18 @@
2626
from .utils.error_utils import set_latest_error, get_latest_error, clear_latest_error
2727
from intel_extension_for_transformers.utils.logger import logging
2828
import importlib
29+
import sys
2930

3031
def check_tts_dependency():
3132
try:
32-
importlib.import_module('paddlespeech')
33-
importlib.import_module('paddle')
34-
importlib.import_module('soundfile')
35-
importlib.import_module('pydub')
36-
importlib.import_module('speechbrain')
37-
importlib.import_module('librosa')
33+
for module in ['soundfile', 'pydub', 'speechbrain', 'librosa', 'zhconv', 'urllib3', 'langid',
34+
'vector_quantize_pytorch', 'cn2an', 'pypinyin', 'jaconv', 'webrtcvad', 'g2p_en', 'inflect',
35+
'jieba']:
36+
importlib.import_module(module)
37+
if sys.platform == 'linux':
38+
importlib.import_module('pyopenjtalk')
39+
else:
40+
importlib.import_module('openjtalk')
3841
return True
3942
except ImportError:
4043
return False
@@ -178,7 +181,7 @@ def build_chatbot(config: PipelineConfig=None):
178181
for plugin_name, plugin_value in config.plugins.items():
179182
enable_plugin = plugin_value.get('enable', False)
180183
if enable_plugin:
181-
if plugin_name == "tts" or plugin_name == "tts_chinese" or plugin_name == "asr":
184+
if plugin_name == "tts" or plugin_name == "tts_multilang" or plugin_name == "asr":
182185
if not check_tts_dependency():
183186
raise ImportError(
184187
f"Unable to initialize 'tts' plugin due to missing dependency packages.\n" \
@@ -229,9 +232,9 @@ def build_chatbot(config: PipelineConfig=None):
229232
if plugin_name == "tts":
230233
from .pipeline.plugins.audio.tts import TextToSpeech
231234
plugins[plugin_name]['class'] = TextToSpeech
232-
elif plugin_name == "tts_chinese":
233-
from .pipeline.plugins.audio.tts_chinese import ChineseTextToSpeech
234-
plugins[plugin_name]['class'] = ChineseTextToSpeech
235+
elif plugin_name == "tts_multilang":
236+
from .pipeline.plugins.audio.tts_multilang import MultilangTextToSpeech
237+
plugins[plugin_name]['class'] = MultilangTextToSpeech
235238
elif plugin_name == "asr":
236239
from .pipeline.plugins.audio.asr import AudioSpeechRecognition
237240
plugins[plugin_name]['class'] = AudioSpeechRecognition

intel_extension_for_transformers/neural_chat/examples/deployment/plugin/audio/README.md

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -47,22 +47,27 @@ pip install -r ../../../requirements.txt
4747

4848
You can customize the configuration file 'audio_service.yaml' to match your environment setup. Here's a table to help you understand the configurable options:
4949

50-
| Item | Value |
51-
| --------------------------------- | ---------------------------------------|
52-
| host | 127.0.0.1 |
53-
| port | 7777 |
54-
| device | "auto" |
55-
| asr.enable | true |
56-
| asr.args.device | "cpu" |
57-
| asr.args.model_name_or_path | "openai/whisper-small" |
58-
| asr.args.bf16 | false |
59-
| tts.enable | true |
60-
| tts.args.device | "cpu" |
61-
| tts.args.voice | "default" |
62-
| tts.args.stream_mode | false |
63-
| tts.args.output_audio_path | "./output_audio.wav" |
64-
| tts.args.speedup | 1.0 |
65-
| tasks_list | ['plugin_audio'] |
50+
| Item | Value |
51+
| ---------------------------------------| ---------------------------------------|
52+
| host | 127.0.0.1 |
53+
| port | 7777 |
54+
| device | "auto" |
55+
| asr.enable | true |
56+
| asr.args.device | "cpu" |
57+
| asr.args.model_name_or_path | "openai/whisper-small" |
58+
| asr.args.bf16 | false |
59+
| tts.enable | true |
60+
| tts.args.device | "cpu" |
61+
| tts.args.voice | "default" |
62+
| tts.args.stream_mode | false |
63+
| tts.args.output_audio_path | "./output_audio.wav" |
64+
| tts_multilang.enable | true |
65+
| tts_multilang.args.device | "cpu" |
66+
| tts_multilang.args.voice | "default" |
67+
| tts_multilang.args.output_audio_path | "./output_audio.wav" |
68+
| tts_multilang.args.precision | "bf16" |
69+
| tasks_list | ['plugin_audio'] |
70+
6671

6772

6873
# Run the audio service server
@@ -77,3 +82,4 @@ To call the started audio service, the APIs are listed as follows:
7782
1. http://127.0.0.1:7777/plugin/audio/asr , upload an audio file and return the text contents.
7883
2. http://127.0.0.1:7777/plugin/audio/tts , input text string and return the binary content of the audio.
7984
3. http://127.0.0.1:7777/plugin/audio/create_embedding, upload an audio file and create an embedding of your voice.
85+
4. http://127.0.0.1:7777/plugin/audio/tts_multilang , input text string and return the binary content of the audio.

intel_extension_for_transformers/neural_chat/examples/deployment/plugin/audio/audio_service.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,10 @@ tts:
4444
stream_mode: false
4545
output_audio_path: "./output_audio.wav"
4646

47+
tts_multilang:
48+
enable: true
49+
args:
50+
device: "cpu"
51+
precision: "bf16"
52+
4753
tasks_list: ['plugin_audio']

intel_extension_for_transformers/neural_chat/models/base_model.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -421,8 +421,8 @@ def face_animate(self, image_path, audio_path=None, text=None, voice=None) -> st
421421
plugin_name = "tts"
422422
if is_plugin_enabled("tts"):
423423
plugin_name = "tts"
424-
elif is_plugin_enabled("tts_chinese"):
425-
plugin_name = "tts_chinese"
424+
elif is_plugin_enabled("tts_multilang"):
425+
plugin_name = "tts_multilang"
426426
else:
427427
raise Exception("Please specify the TTS plugin!")
428428
plugin_instance = get_plugin_instance(plugin_name)
@@ -495,12 +495,10 @@ def register_plugin_instance(self, plugin_name, instance):
495495
"""
496496
if plugin_name == "tts":
497497
self.tts = instance
498-
if plugin_name == "tts_chinese":
499-
self.tts_chinese = instance
498+
if plugin_name == "tts_multilang":
499+
self.tts_multilang = instance
500500
if plugin_name == "asr":
501501
self.asr = instance
502-
if plugin_name == "asr_chinese":
503-
self.asr_chinese = instance
504502
if plugin_name == "retrieval":
505503
self.retrieval = instance
506504
if plugin_name == "cache":

intel_extension_for_transformers/neural_chat/pipeline/plugins/audio/README.md

Lines changed: 16 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,9 @@ The Audio Processing and Text-to-Speech (TTS) Plugin is a software component des
44

55
- **Text-to-Speech (TTS) Conversion**: The TTS plugin can convert written text into natural-sounding speech by synthesizing human-like voices. Users can customize the voice, tone, and speed of the generated speech to suit their specific requirements.
66

7-
- **Speech Recognition**: The ASR plugin support speech recognition, allowing it to transcribe spoken words into text. This can be used for applications like voice commands, transcription services, and voice-controlled interfaces. It supports both English and Chinese.
7+
- **Audio Speech Recognition (ASR)**: The ASR plugin supports speech recognition, allowing it to transcribe spoken words into text. This can be used for applications like voice commands, transcription services, and voice-controlled interfaces. It supports both English and Chinese.
88

9-
- **Multi-Language Support**: The plugin typically supports multiple languages and accents, making it versatile for global applications and catering to diverse user bases. It supports both English and Chinese now.
9+
- **Multi-Language Support**: The plugins typically support multiple languages and accents, making it versatile for global applications and catering to diverse user bases. The ASR plugin supports tens of languages that the Whisper model supports. The TTS plugin supports English, Chinese and Japanese currently.
1010

1111
- **Integration**: Developers can easily integrate this plugin into their applications or systems using APIs.
1212

@@ -22,24 +22,18 @@ sudo dpkg -i libssl1.1_1.1.1f-1ubuntu2.19_amd64.deb
2222

2323
For other operating systems such as CentOS, you will need to make slight adjustments.
2424

25-
# Multilingual Automatic Speech Recognition (ASR)
25+
# Multi Language Automatic Speech Recognition (ASR)
2626

27-
## Dependencies Installation
28-
29-
To use the ASR module, you need to install the necessary dependencies. You can do this by running the following command:
30-
31-
```bash
32-
pip install transformers datasets pydub
33-
```
27+
We support multi-language Automatic Speech Recognition using Whisper.
3428

3529
## Usage
3630

37-
The AudioSpeechRecognition class provides functionality for converting English/Multiligual audio to text. Here's how to use it:
31+
The AudioSpeechRecognition class provides functionality for converting multi-language audio to text. Here's how to use it:
3832

3933
```python
4034
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.asr import AudioSpeechRecognition
4135
# pass the parameter language="auto" to let the asr model automatically detect language
42-
# otherwise, you can pass an arbitrary language to the model (e.g. en/zh/de/fr)
36+
# otherwise, you can pass an arbitrary language to the model (e.g. en/zh/de/fr...)
4337
asr = AudioSpeechRecognition("openai/whisper-small", language="auto", device=self.device)
4438
audio_path = "~/audio.wav" # Replace with the path to your English audio file (supports MP3 and WAV)
4539
result = asr.audio2text(audio_path)
@@ -49,6 +43,7 @@ print("ASR Result:", result)
4943

5044
# English Text-to-Speech (TTS)
5145

46+
We support English-only TTS based on [SpeechT5](https://arxiv.org/pdf/2110.07205.pdf), with checkpoints downloaded directly from [HuggingFace](https://huggingface.co/microsoft/speecht5_tts). It is a two-stage TTS model composed of an acoustic model and a vocoder, and it uses a speaker embedding to distinguish between different voices. In our early experiments and development, this model with the pretrained weights produced relatively good English-only audio results and could perform voice cloning with only a few audio samples from new speakers.
5247
## Dependencies Installation
5348

5449
To use the English TTS module, you need to install the required dependencies. Run the following command:
@@ -70,33 +65,26 @@ voice = "default" # You can choose between "default," "pat," or a custom voice
7065
tts.text2speech(text_to_speak, output_audio_path, voice)
7166
```
7267

73-
# Chinese Text-to-Speech (TTS)
68+
# Multi Language Text-to-Speech (TTS)
7469

75-
## Dependencies Installation
76-
77-
To use the Chinese TTS module, you need to install the required dependencies. Run the following command:
70+
We support multi-language, multi-speaker text-to-speech functionality (Chinese, English, Japanese) on top of the project [Bert-VITS2](https://github.com/fishaudio/Bert-VITS2), with [IPEX](https://github.com/intel/intel-extension-for-pytorch) BFloat16 inference optimization on Xeon CPU. We finetune our [checkpoints](https://huggingface.co/spycsh/bert-vits-thchs-6-8000) with partial data (6 speakers) from the audio dataset [THCHS-30](https://www.openslr.org/18/). It has a backbone of [VITS](https://arxiv.org/pdf/2106.06103.pdf), which is itself an end-to-end TTS model. Together with BERT to convert the text embedding, VITS has been proven to combine more complex latent text features with audio to obtain high-quality TTS results in multiple speakers' voices.
7871

79-
```bash
80-
pip install paddlespeech paddlepaddle
81-
```
8272

8373
## Usage
8474

85-
The ChineseTextToSpeech class within your module provides functionality for TTS. Here's how to use it:
75+
The `MultilangTextToSpeech` class within your module provides functionality for TTS. Here's how to use it:
8676

8777
```python
88-
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts_chinese import ChineseTextToSpeech
78+
from intel_extension_for_transformers.neural_chat.pipeline.plugins.audio.tts_multilang import MultilangTextToSpeech
8979
# Initialize the TTS module
90-
tts = ChineseTextToSpeech()
80+
tts = MultilangTextToSpeech()
9181
# Define the text you want to convert to speech
92-
text_to_speak = "你好,这是一个示例文本。" # Replace with your Chinese text
82+
text_to_speak = "欢迎来到英特尔,welcome to Intel。こんにちは!" # Replace with your multi-language text
9383
# Specify the output audio path
9484
output_audio_path = "./output.wav" # Replace with your desired output audio path
9585
# Perform text-to-speech conversion
96-
tts.text2speech(text_to_speak)
86+
tts.text2speech(text_to_speak, output_audio_path)
9787

98-
# If you want to stream the generation of audio from a text generator (e.g., a language model),
99-
# you can use the following method:
100-
# audio_generator = your_text_generator_function() # Replace with your text generator
101-
# tts.stream_text2speech(audio_generator)
88+
# If you want to change the speaker, change the sid
89+
# tts.text2speech(text_to_speak, output_audio_path, sid=1)
10290
```
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright (c) 2023 Intel Corporation
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# Copyright (c) 2023 Intel Corporation
5+
#
6+
# Licensed under the Apache License, Version 2.0 (the "License");
7+
# you may not use this file except in compliance with the License.
8+
# You may obtain a copy of the License at
9+
#
10+
# http://www.apache.org/licenses/LICENSE-2.0
11+
#
12+
# Unless required by applicable law or agreed to in writing, software
13+
# distributed under the License is distributed on an "AS IS" BASIS,
14+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
# See the License for the specific language governing permissions and
16+
# limitations under the License.

0 commit comments

Comments
 (0)