diff --git a/hub_module/modules/audio/tts/deepvoice3_ljspeech/README.md b/hub_module/modules/audio/tts/deepvoice3_ljspeech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..fc7128b9bbca160993cb5dc3e17aa95f08331357
--- /dev/null
+++ b/hub_module/modules/audio/tts/deepvoice3_ljspeech/README.md
@@ -0,0 +1,119 @@
+## Overview
+
+Deep Voice 3 is an end-to-end TTS model released by Baidu Research in 2017 (the paper was accepted at ICLR 2018). It is a seq2seq model built on convolutional neural networks and an attention mechanism; since it contains no recurrent networks, it can be trained in parallel, far faster than RNN-based models. Deep Voice 3 can learn the characteristics of multiple speakers and can be paired with a variety of vocoders. deepvoice3_ljspeech is an English TTS model pretrained on the LJSpeech English speech dataset and supports prediction only.
+
+For more details, see the paper [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
+
+## Command-line Prediction
+
+```shell
+$ hub run deepvoice3_ljspeech --input_text='Simple as this proposition is, it is necessary to be stated'
+```
+
+## API
+
+```python
+def synthesize(texts, use_gpu=False, vocoder="griffin-lim"):
+```
+
+Prediction API that synthesizes audio waveforms from the input texts.
+
+**Parameters**
+
+* texts (list\[str\]): texts to be synthesized;
+* use\_gpu (bool): whether to use the GPU; **if you use the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable first**;
+* vocoder (str): the vocoder to use, either "griffin-lim" or "waveflow".
+
+**Returns**
+
+* wavs (list): list of synthesis results; each element is the audio waveform for the corresponding input text and can be further processed or saved with `soundfile.write`.
+* sample\_rate (int): sample rate of the synthesized audio.
+
+**Code Example**
+
+```python
+import paddlehub as hub
+import soundfile as sf
+
+# Load deepvoice3_ljspeech module.
+module = hub.Module(name="deepvoice3_ljspeech")
+
+# Synthesize the texts into audio waveforms.
+test_texts = ['Simple as this proposition is, it is necessary to be stated',
+ 'Parakeet stands for Paddle PARAllel text-to-speech toolkit']
+wavs, sample_rate = module.synthesize(texts=test_texts)
+for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
+```
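+
+By default `synthesize` uses the Griffin-Lim vocoder. A minimal variant of the example above that switches to the WaveFlow vocoder (same API; only the `vocoder` argument changes):
+
+```python
+import paddlehub as hub
+import soundfile as sf
+
+module = hub.Module(name="deepvoice3_ljspeech")
+
+# Use the neural WaveFlow vocoder instead of the default Griffin-Lim.
+wavs, sample_rate = module.synthesize(
+    texts=['Simple as this proposition is, it is necessary to be stated'],
+    vocoder="waveflow")
+for index, wav in enumerate(wavs):
+    sf.write(f"waveflow_{index}.wav", wav, sample_rate)
+```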
+
+## Serving Deployment
+
+PaddleHub Serving can deploy the module as an online service.
+
+### Step 1: Start PaddleHub Serving
+
+Run the startup command:
+```shell
+$ hub serving start -m deepvoice3_ljspeech
+```
+
+This deploys a service API; the default port is 8866.
+
+**NOTE:** To predict with the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable before starting the service; otherwise there is nothing to set.
+
+### Step 2: Send a Prediction Request
+
+With the server up, the following few lines of code send a prediction request and retrieve the result:
+
+```python
+import requests
+import json
+
+import soundfile as sf
+
+# Send the HTTP request.
+
+data = {'texts':['Simple as this proposition is, it is necessary to be stated',
+ 'Parakeet stands for Paddle PARAllel text-to-speech toolkit'],
+ 'use_gpu':False}
+headers = {"Content-type": "application/json"}
+url = "http://127.0.0.1:8866/predict/deepvoice3_ljspeech"
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+
+# Save the results.
+result = r.json()["results"]
+wavs = result["wavs"]
+sample_rate = result["sample_rate"]
+for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
+```
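+
+The service method also accepts an optional `'vocoder'` field in `data` ("griffin-lim" by default, or "waveflow"), mirroring the `synthesize` API.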
+
+## Source Code
+
+https://github.com/PaddlePaddle/Parakeet
+
+### Dependencies
+
+paddlepaddle >= 1.8.2
+
+paddlehub >= 1.7.0
+
+**NOTE:** Besides the Python dependencies, the libsndfile library must also be installed.
+On Ubuntu, run:
+```
+sudo apt-get install libsndfile1
+```
+On CentOS, run:
+```
+sudo yum install libsndfile
+```
+
+## Release History
+
+* 1.0.0
+
+  Initial release
diff --git a/hub_module/modules/audio/tts/deepvoice3_ljspeech/__init__.py b/hub_module/modules/audio/tts/deepvoice3_ljspeech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hub_module/modules/audio/tts/deepvoice3_ljspeech/module.py b/hub_module/modules/audio/tts/deepvoice3_ljspeech/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..0847cc3cec2a7df8874354e1898eb151f8c55593
--- /dev/null
+++ b/hub_module/modules/audio/tts/deepvoice3_ljspeech/module.py
@@ -0,0 +1,359 @@
+# coding:utf-8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import argparse
+import ast
+import importlib.util
+
+import nltk
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dg
+import paddlehub as hub
+from paddlehub.common.logger import logger
+from paddlehub.module.module import moduleinfo, serving
+from paddlehub.common.dir import THIRD_PARTY_HOME
+from paddlehub.common.utils import mkdir
+from paddlehub.common.downloader import default_downloader
+from paddlehub.module.module import runnable
+from paddlehub.module.nlp_module import DataFormatError
+
+lack_dependency = []
+for dependency in ["ruamel", "parakeet", "soundfile", "librosa"]:
+ if not importlib.util.find_spec(dependency):
+ lack_dependency.append(dependency)
+
+# Accelerate NLTK package download via paddlehub. 'import parakeet' will use the package.
+_PUNKT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/punkt.tar.gz"
+_CMUDICT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/cmudict.tar.gz"
+nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
+tokenizers_path = os.path.join(nltk_path, "tokenizers")
+corpora_path = os.path.join(nltk_path, "corpora")
+punkt_path = os.path.join(tokenizers_path, "punkt")
+cmudict_path = os.path.join(corpora_path, "cmudict")
+
+if not os.path.exists(punkt_path):
+ default_downloader.download_file_and_uncompress(
+ url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
+if not os.path.exists(cmudict_path):
+ default_downloader.download_file_and_uncompress(
+ url=_CMUDICT_URL, save_path=corpora_path, print_progress=True)
+nltk.data.path.append(nltk_path)
+
+if not lack_dependency:
+ import soundfile as sf
+ import librosa
+ import ruamel.yaml
+ from parakeet.utils import io
+ from parakeet.g2p import en
+ from parakeet.models.deepvoice3 import Encoder, Decoder, PostNet, SpectraNet
+ from parakeet.models.waveflow import WaveFlowModule
+else:
+ raise ImportError(
+ "The module requires additional dependencies: %s. You can install parakeet via 'git clone https://github.com/PaddlePaddle/Parakeet && cd Parakeet && pip install -e .' and others via pip install"
+ % ", ".join(lack_dependency))
+
+
+class AttrDict(dict):
+ def __init__(self, *args, **kwargs):
+ super(AttrDict, self).__init__(*args, **kwargs)
+ self.__dict__ = self
+
+
+class WaveflowVocoder(object):
+ def __init__(self, config_path, checkpoint_path):
+ with open(config_path, 'rt') as f:
+ config = ruamel.yaml.safe_load(f)
+ ns = argparse.Namespace()
+ for k, v in config.items():
+ setattr(ns, k, v)
+ ns.use_fp16 = False
+
+ self.model = WaveFlowModule(ns)
+ io.load_parameters(self.model, checkpoint_path=checkpoint_path)
+
+ def __call__(self, mel):
+ with dg.no_grad():
+ self.model.eval()
+ audio = self.model.synthesize(mel)
+ self.model.train()
+ return audio
+
+
+class GriffinLimVocoder(object):
+ def __init__(self,
+ sharpening_factor=1.4,
+ sample_rate=22050,
+ n_fft=1024,
+ win_length=1024,
+ hop_length=256):
+ self.sample_rate = sample_rate
+ self.n_fft = n_fft
+ self.sharpening_factor = sharpening_factor
+ self.win_length = win_length
+ self.hop_length = hop_length
+
+ def __call__(self, mel):
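+        # Invert the (natural-log) mel spectrogram to a linear magnitude
+        # spectrogram, then reconstruct phase with the Griffin-Lim algorithm.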
+ spec = librosa.feature.inverse.mel_to_stft(
+ np.exp(mel),
+ sr=self.sample_rate,
+ n_fft=self.n_fft,
+ fmin=0,
+ fmax=8000.0,
+ power=1.0)
+ audio = librosa.core.griffinlim(
+ spec**self.sharpening_factor,
+ win_length=self.win_length,
+ hop_length=self.hop_length)
+ return audio
+
+
+@moduleinfo(
+ name="deepvoice3_ljspeech",
+ version="1.0.0",
+ summary=
+ "Deep Voice 3, a fully-convolutional attention-based neural text-to-speech (TTS) system.",
+ author="paddlepaddle",
+ author_email="",
+ type="nlp/tts",
+)
+class DeepVoice3(hub.NLPPredictionModule):
+ def _initialize(self):
+ """
+ initialize with the necessary elements
+ """
+ self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
+ "step-1780000")
+ self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
+ "vocoder", "step-2000000")
+ self.waveflow_config_path = os.path.join(
+ self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")
+ tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
+ "ljspeech.yaml")
+ with open(tts_checkpoint_path) as f:
+ self.tts_config = ruamel.yaml.safe_load(f)
+
+ with fluid.dygraph.guard(fluid.CPUPlace()):
+ char_embedding = dg.Embedding((en.n_vocab,
+ self.tts_config["char_dim"]))
+ multi_speaker = self.tts_config["n_speakers"] > 1
+ speaker_embedding = dg.Embedding((self.tts_config["n_speakers"], self.tts_config["speaker_dim"])) \
+ if multi_speaker else None
+ encoder = Encoder(
+ self.tts_config["encoder_layers"],
+ self.tts_config["char_dim"],
+ self.tts_config["encoder_dim"],
+ self.tts_config["kernel_size"],
+ has_bias=multi_speaker,
+ bias_dim=self.tts_config["speaker_dim"],
+ keep_prob=1.0 - self.tts_config["dropout"])
+ decoder = Decoder(
+ self.tts_config["n_mels"],
+ self.tts_config["reduction_factor"],
+ list(self.tts_config["prenet_sizes"]) +
+ [self.tts_config["char_dim"]],
+ self.tts_config["decoder_layers"],
+ self.tts_config["kernel_size"],
+ self.tts_config["attention_dim"],
+ position_encoding_weight=self.tts_config["position_weight"],
+ omega=self.tts_config["position_rate"],
+ has_bias=multi_speaker,
+ bias_dim=self.tts_config["speaker_dim"],
+ keep_prob=1.0 - self.tts_config["dropout"])
+ postnet = PostNet(
+ self.tts_config["postnet_layers"],
+ self.tts_config["char_dim"],
+ self.tts_config["postnet_dim"],
+ self.tts_config["kernel_size"],
+ self.tts_config["n_mels"],
+ self.tts_config["reduction_factor"],
+ has_bias=multi_speaker,
+ bias_dim=self.tts_config["speaker_dim"],
+ keep_prob=1.0 - self.tts_config["dropout"])
+ self.tts_model = SpectraNet(char_embedding, speaker_embedding,
+ encoder, decoder, postnet)
+ io.load_parameters(
+ model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)
+
+ self.waveflow = WaveflowVocoder(
+ config_path=self.waveflow_config_path,
+ checkpoint_path=self.waveflow_checkpoint_path)
+ self.griffin = GriffinLimVocoder(
+ sharpening_factor=self.tts_config["sharpening_factor"],
+ sample_rate=self.tts_config["sample_rate"],
+ n_fft=self.tts_config["n_fft"],
+ win_length=self.tts_config["win_length"],
+ hop_length=self.tts_config["hop_length"])
+
+ def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
+ """
+ Get the synthetic wavs from the texts.
+
+ Args:
+ texts(list): the input texts to be predicted.
+            use_gpu(bool): whether to use the GPU for prediction.
+            vocoder(str): the vocoder name, "griffin-lim" or "waveflow".
+
+        Returns:
+            wavs(list): the synthesized audio waveforms; use soundfile.write to save them.
+ sample_rate(int): the audio sample rate.
+ """
+ if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+ use_gpu = False
+ logger.warning(
+ "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
+ )
+
+ place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
+
+ if texts and isinstance(texts, list):
+ predicted_data = texts
+ else:
+ raise ValueError(
+ "The input data is inconsistent with expectations.")
+
+ wavs = []
+ with fluid.dygraph.guard(place):
+ self.tts_model.eval()
+ self.waveflow.model.eval()
+ monotonic_layers = [4]
+ for text in predicted_data:
+ # init input
+ logger.info("Processing sentence: %s" % text)
+ text = en.text_to_sequence(text, p=1.0)
+ text = np.expand_dims(np.array(text, dtype="int64"), 0)
+ lengths = np.array([text.size], dtype=np.int64)
+ text_seqs = dg.to_variable(text)
+ text_lengths = dg.to_variable(lengths)
+
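+                # Force monotonic attention on the selected decoder layers
+                # (monotonic_layers = [4] here) for more stable alignment at inference.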
+ decoder_layers = self.tts_config["decoder_layers"]
+ force_monotonic_attention = [False] * decoder_layers
+ for i in monotonic_layers:
+ force_monotonic_attention[i] = True
+
+ outputs = self.tts_model(
+ text_seqs,
+ text_lengths,
+ speakers=None,
+ force_monotonic_attention=force_monotonic_attention,
+ window=(self.tts_config["backward_step"],
+ self.tts_config["forward_step"]))
+ decoded, refined, attentions = outputs
+ if vocoder == 'griffin-lim':
+ # synthesis use griffin-lim
+ wav = self.griffin(refined.numpy()[0].T)
+ elif vocoder == 'waveflow':
+ # synthesis use waveflow
+ wav = self.waveflow(
+ fluid.layers.transpose(refined, [0, 2, 1])).numpy()[0]
+ else:
+ raise ValueError(
+                        'Unsupported vocoder: only "griffin-lim" and "waveflow" are supported, but received %s.'
+ % vocoder)
+ wavs.append(wav)
+ return wavs, self.tts_config["sample_rate"]
+
+ @serving
+ def serving_method(self, texts, use_gpu=False, vocoder="griffin-lim"):
+ """
+ Run as a service.
+ """
+ wavs, sample_rate = self.synthesize(texts, use_gpu, vocoder)
+ wavs = [wav.tolist() for wav in wavs]
+ result = {"wavs": wavs, "sample_rate": sample_rate}
+ return result
+
+ def add_module_config_arg(self):
+ """
+ Add the command config options
+ """
+ self.arg_config_group.add_argument(
+ '--use_gpu',
+ type=ast.literal_eval,
+ default=False,
+ help="whether use GPU for prediction")
+
+ self.arg_config_group.add_argument(
+ '--vocoder',
+ type=str,
+ default="griffin-lim",
+ choices=['griffin-lim', 'waveflow'],
+ help="the vocoder name")
+
+ def add_module_output_arg(self):
+ """
+ Add the command config options
+ """
+ self.arg_config_group.add_argument(
+ '--output_path',
+ type=str,
+ default=os.path.abspath(
+ os.path.join(os.path.curdir, f"{self.name}_prediction")),
+ help="path to save experiment results")
+
+ @runnable
+ def run_cmd(self, argvs):
+ """
+ Run as a command
+ """
+ self.parser = argparse.ArgumentParser(
+ description='Run the %s module.' % self.name,
+ prog='hub run %s' % self.name,
+ usage='%(prog)s',
+ add_help=True)
+
+        self.arg_input_group = self.parser.add_argument_group(
+            title="Input options", description="Input data. Required.")
+        self.arg_output_group = self.parser.add_argument_group(
+            title="Output options", description="Output path. Optional.")
+ self.arg_config_group = self.parser.add_argument_group(
+ title="Config options",
+ description=
+ "Run configuration for controlling module behavior, optional.")
+
+ self.add_module_config_arg()
+ self.add_module_input_arg()
+ self.add_module_output_arg()
+
+ args = self.parser.parse_args(argvs)
+
+ try:
+ input_data = self.check_input_data(args)
+        except (DataFormatError, RuntimeError):
+ self.parser.print_help()
+ return None
+
+ mkdir(args.output_path)
+ wavs, sample_rate = self.synthesize(
+ texts=input_data, use_gpu=args.use_gpu, vocoder=args.vocoder)
+
+ for index, wav in enumerate(wavs):
+ sf.write(
+ os.path.join(args.output_path, f"{index}.wav"), wav,
+ sample_rate)
+
+ ret = f"The synthesized wav files have been saved in {args.output_path}"
+ return ret
+
+
+if __name__ == "__main__":
+ module = DeepVoice3()
+ test_text = [
+ "Simple as this proposition is, it is necessary to be stated",
+ "Parakeet stands for Paddle PARAllel text-to-speech toolkit.",
+ ]
+ wavs, sample_rate = module.synthesize(texts=test_text, vocoder="waveflow")
+ for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
diff --git a/hub_module/modules/audio/tts/fastspeech_ljspeech/README.md b/hub_module/modules/audio/tts/fastspeech_ljspeech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5709ee1795fbb01166cd45c541e0ce5d88339fd6
--- /dev/null
+++ b/hub_module/modules/audio/tts/fastspeech_ljspeech/README.md
@@ -0,0 +1,120 @@
+## Overview
+
+FastSpeech is a Transformer-based feed-forward network. The authors extract attention diagonals from an encoder-decoder teacher model to predict phoneme durations, then use a length regulator to expand the text sequence to match the length of the target mel spectrogram, so that mel spectrograms can be generated in parallel. The model largely eliminates word skipping and repetition in hard cases and can smoothly adjust speech speed; more importantly, it greatly speeds up mel-spectrogram generation. fastspeech_ljspeech is an English TTS model pretrained on the LJSpeech English speech dataset and supports prediction only.
+
+For more details, see the paper [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263).
+
+## Command-line Prediction
+
+```shell
+$ hub run fastspeech_ljspeech --input_text='Simple as this proposition is, it is necessary to be stated'
+```
+
+## API
+
+```python
+def synthesize(texts, use_gpu=False, speed=1.0, vocoder="griffin-lim"):
+```
+
+Prediction API that synthesizes audio waveforms from the input texts.
+
+**Parameters**
+
+* texts (list\[str\]): texts to be synthesized;
+* use\_gpu (bool): whether to use the GPU; **if you use the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable first**;
+* speed (float): speech speed; 1.0 means the original speed;
+* vocoder (str): the vocoder to use, either "griffin-lim" or "waveflow".
+
+**Returns**
+
+* wavs (list): list of synthesis results; each element is the audio waveform for the corresponding input text and can be further processed or saved with `soundfile.write`.
+* sample\_rate (int): sample rate of the synthesized audio.
+
+**Code Example**
+
+```python
+import paddlehub as hub
+import soundfile as sf
+
+# Load fastspeech_ljspeech module.
+module = hub.Module(name="fastspeech_ljspeech")
+
+# Synthesize the texts into audio waveforms.
+test_texts = ['Simple as this proposition is, it is necessary to be stated',
+ 'Parakeet stands for Paddle PARAllel text-to-speech toolkit']
+wavs, sample_rate = module.synthesize(texts=test_texts)
+for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
+```
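+
+FastSpeech's length regulator makes speech speed directly adjustable. A minimal variant of the example above that only adds the `speed` argument (values above 1.0 speed the voice up, values below 1.0 slow it down):
+
+```python
+import paddlehub as hub
+import soundfile as sf
+
+module = hub.Module(name="fastspeech_ljspeech")
+
+# Synthesize at 1.2x the original speed.
+wavs, sample_rate = module.synthesize(
+    texts=['Simple as this proposition is, it is necessary to be stated'],
+    speed=1.2)
+sf.write("fast.wav", wavs[0], sample_rate)
+```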
+
+## Serving Deployment
+
+PaddleHub Serving can deploy the module as an online service.
+
+### Step 1: Start PaddleHub Serving
+
+Run the startup command:
+```shell
+$ hub serving start -m fastspeech_ljspeech
+```
+
+This deploys a service API; the default port is 8866.
+
+**NOTE:** To predict with the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable before starting the service; otherwise there is nothing to set.
+
+### Step 2: Send a Prediction Request
+
+With the server up, the following few lines of code send a prediction request and retrieve the result:
+
+```python
+import requests
+import json
+
+import soundfile as sf
+
+# Send the HTTP request.
+
+data = {'texts':['Simple as this proposition is, it is necessary to be stated',
+ 'Parakeet stands for Paddle PARAllel text-to-speech toolkit'],
+ 'use_gpu':False}
+headers = {"Content-type": "application/json"}
+url = "http://127.0.0.1:8866/predict/fastspeech_ljspeech"
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+
+# Save the results.
+result = r.json()["results"]
+wavs = result["wavs"]
+sample_rate = result["sample_rate"]
+for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
+```
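+
+The service method accepts the same optional arguments as the Python API, so a `'speed'` field (e.g. `'speed': 1.2`) or a `'vocoder'` field can also be added to `data`.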
+
+## Source Code
+
+https://github.com/PaddlePaddle/Parakeet
+
+### Dependencies
+
+paddlepaddle >= 1.8.2
+
+paddlehub >= 1.7.0
+
+**NOTE:** Besides the Python dependencies, the libsndfile library must also be installed.
+On Ubuntu, run:
+```
+sudo apt-get install libsndfile1
+```
+On CentOS, run:
+```
+sudo yum install libsndfile
+```
+
+## Release History
+
+* 1.0.0
+
+  Initial release
diff --git a/hub_module/modules/audio/tts/fastspeech_ljspeech/__init__.py b/hub_module/modules/audio/tts/fastspeech_ljspeech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hub_module/modules/audio/tts/fastspeech_ljspeech/module.py b/hub_module/modules/audio/tts/fastspeech_ljspeech/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..e308d53a84f6c6307578ba58ec8ca0910c3a1aa8
--- /dev/null
+++ b/hub_module/modules/audio/tts/fastspeech_ljspeech/module.py
@@ -0,0 +1,311 @@
+# coding:utf-8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import ast
+import argparse
+import importlib.util
+
+import nltk
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dg
+import paddlehub as hub
+from paddlehub.module.module import runnable
+from paddlehub.common.utils import mkdir
+from paddlehub.module.nlp_module import DataFormatError
+from paddlehub.common.logger import logger
+from paddlehub.module.module import moduleinfo, serving
+from paddlehub.common.dir import THIRD_PARTY_HOME
+from paddlehub.common.downloader import default_downloader
+
+lack_dependency = []
+for dependency in ["ruamel", "parakeet", "soundfile", "librosa"]:
+ if not importlib.util.find_spec(dependency):
+ lack_dependency.append(dependency)
+
+# Accelerate NLTK package download via paddlehub. 'import parakeet' will use the package.
+_PUNKT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/punkt.tar.gz"
+_CMUDICT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/cmudict.tar.gz"
+nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
+tokenizers_path = os.path.join(nltk_path, "tokenizers")
+corpora_path = os.path.join(nltk_path, "corpora")
+punkt_path = os.path.join(tokenizers_path, "punkt")
+cmudict_path = os.path.join(corpora_path, "cmudict")
+
+if not os.path.exists(punkt_path):
+ default_downloader.download_file_and_uncompress(
+ url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
+if not os.path.exists(cmudict_path):
+ default_downloader.download_file_and_uncompress(
+ url=_CMUDICT_URL, save_path=corpora_path, print_progress=True)
+nltk.data.path.append(nltk_path)
+
+if not lack_dependency:
+ import soundfile as sf
+ import librosa
+ from ruamel import yaml
+ from parakeet.models.fastspeech.fastspeech import FastSpeech as FastSpeechModel
+ from parakeet.g2p.en import text_to_sequence
+ from parakeet.models.transformer_tts.utils import *
+ from parakeet.utils import io
+ from parakeet.modules.weight_norm import WeightNormWrapper
+ from parakeet.models.waveflow import WaveFlowModule
+else:
+ raise ImportError(
+ "The module requires additional dependencies: %s. You can install parakeet via 'git clone https://github.com/PaddlePaddle/Parakeet && cd Parakeet && pip install -e .' and others via pip install"
+ % ", ".join(lack_dependency))
+
+
+class AttrDict(dict):
+ def __init__(self, *args, **kwargs):
+ super(AttrDict, self).__init__(*args, **kwargs)
+ self.__dict__ = self
+
+
+@moduleinfo(
+ name="fastspeech_ljspeech",
+ version="1.0.0",
+ summary=
+ "FastSpeech proposes a novel feed-forward network based on Transformer to generate mel-spectrogram in parallel for TTS. See https://arxiv.org/abs/1905.09263 for details.",
+ author="baidu-nlp",
+ author_email="",
+ type="nlp/tts",
+)
+class FastSpeech(hub.NLPPredictionModule):
+ def _initialize(self):
+ """
+ initialize with the necessary elements
+ """
+ self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
+ "step-162000")
+ self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
+ "vocoder", "step-2000000")
+ self.waveflow_config_path = os.path.join(
+ self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")
+
+ tts_config_path = os.path.join(self.directory, "assets", "tts",
+ "ljspeech.yaml")
+ with open(tts_config_path) as f:
+ self.tts_config = yaml.load(f, Loader=yaml.Loader)
+ with fluid.dygraph.guard(fluid.CPUPlace()):
+ self.tts_model = FastSpeechModel(
+ self.tts_config['network'],
+ num_mels=self.tts_config['audio']['num_mels'])
+ io.load_parameters(
+ model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)
+
+ # Build vocoder.
+ args = AttrDict()
+ args.config = self.waveflow_config_path
+ args.use_fp16 = False
+ self.waveflow_config = io.add_yaml_config_to_args(args)
+ self.waveflow = WaveFlowModule(self.waveflow_config)
+ io.load_parameters(
+ model=self.waveflow,
+ checkpoint_path=self.waveflow_checkpoint_path)
+
+ def synthesize(self, texts, use_gpu=False, speed=1.0,
+ vocoder="griffin-lim"):
+ """
+ Get the synthetic wavs from the texts.
+
+ Args:
+ texts(list): the input texts to be predicted.
+            use_gpu(bool): whether to use the GPU for prediction. Default False.
+            speed(float): controls the speech speed; 1.0 is the original speed. Default 1.0.
+            vocoder(str): the vocoder name, "griffin-lim" or "waveflow".
+
+        Returns:
+            wavs(list): the synthesized audio waveforms; use soundfile.write to save them.
+ sample_rate(int): the audio sample rate.
+ """
+ if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+ use_gpu = False
+ logger.warning(
+ "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
+ )
+ if use_gpu:
+ place = fluid.CUDAPlace(0)
+ else:
+ place = fluid.CPUPlace()
+
+ if texts and isinstance(texts, list):
+ predicted_data = texts
+ else:
+ raise ValueError(
+ "The input data is inconsistent with expectations.")
+
+ wavs = []
+ with fluid.dygraph.guard(place):
+ self.tts_model.eval()
+ self.waveflow.eval()
+ for text in predicted_data:
+ # init input
+ logger.info("Processing sentence: %s" % text)
+ text = np.asarray(text_to_sequence(text))
+ text = np.expand_dims(text, axis=0)
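+                # Build 1-based position indices for the positional encoding.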
+ pos_text = np.arange(1, text.shape[1] + 1)
+ pos_text = np.expand_dims(pos_text, axis=0)
+
+ text = dg.to_variable(text).astype(np.int64)
+ pos_text = dg.to_variable(pos_text).astype(np.int64)
+
+ _, mel_output_postnet = self.tts_model(
+ text, pos_text, alpha=1 / speed)
+
+ if vocoder == 'griffin-lim':
+ # synthesis use griffin-lim
+ wav = self.synthesis_with_griffinlim(
+ mel_output_postnet, self.tts_config['audio'])
+ elif vocoder == 'waveflow':
+ wav = self.synthesis_with_waveflow(
+ mel_output_postnet, self.waveflow_config.sigma)
+ else:
+ raise ValueError(
+                        'Unsupported vocoder: only "griffin-lim" and "waveflow" are supported, but received %s.'
+ % vocoder)
+ wavs.append(wav)
+ return wavs, self.tts_config['audio']['sr']
+
+ def synthesis_with_griffinlim(self, mel_output, cfg):
+ # synthesis with griffin-lim
+ mel_output = fluid.layers.transpose(
+ fluid.layers.squeeze(mel_output, [0]), [1, 0])
+ mel_output = np.exp(mel_output.numpy())
+ basis = librosa.filters.mel(
+ cfg['sr'],
+ cfg['n_fft'],
+ cfg['num_mels'],
+ fmin=cfg['fmin'],
+ fmax=cfg['fmax'])
+ inv_basis = np.linalg.pinv(basis)
+ spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
+
+ wav = librosa.core.griffinlim(
+ spec**cfg['power'],
+ hop_length=cfg['hop_length'],
+ win_length=cfg['win_length'])
+
+ return wav
+
+ def synthesis_with_waveflow(self, mel_output, sigma):
+ mel_spectrogram = fluid.layers.transpose(
+ fluid.layers.squeeze(mel_output, [0]), [1, 0])
+ mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])
+
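+        # Fuse weight-normalized parameters so inference runs on the plain weights.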
+ for layer in self.waveflow.sublayers():
+ if isinstance(layer, WeightNormWrapper):
+ layer.remove_weight_norm()
+
+ # Run model inference.
+ wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
+ return wav.numpy()[0]
+
+ @serving
+ def serving_method(self,
+ texts,
+ use_gpu=False,
+ speed=1.0,
+ vocoder="griffin-lim"):
+ """
+ Run as a service.
+ """
+ wavs, sample_rate = self.synthesize(texts, use_gpu, speed, vocoder)
+ wavs = [wav.tolist() for wav in wavs]
+ result = {"wavs": wavs, "sample_rate": sample_rate}
+ return result
+
+ def add_module_config_arg(self):
+ """
+ Add the command config options
+ """
+ self.arg_config_group.add_argument(
+ '--use_gpu',
+ type=ast.literal_eval,
+ default=False,
+ help="whether use GPU for prediction")
+
+        self.arg_config_group.add_argument(
+            '--vocoder',
+            type=str,
+            default="griffin-lim",
+            choices=['griffin-lim', 'waveflow'],
+            help="the vocoder name")
+
+        self.arg_config_group.add_argument(
+            '--speed',
+            type=float,
+            default=1.0,
+            help="speech speed; 1.0 means the original speed")
+
+ def add_module_output_arg(self):
+ """
+ Add the command config options
+ """
+ self.arg_config_group.add_argument(
+ '--output_path',
+ type=str,
+ default=os.path.abspath(
+ os.path.join(os.path.curdir, f"{self.name}_prediction")),
+ help="path to save experiment results")
+
+ @runnable
+ def run_cmd(self, argvs):
+ """
+ Run as a command
+ """
+ self.parser = argparse.ArgumentParser(
+ description='Run the %s module.' % self.name,
+ prog='hub run %s' % self.name,
+ usage='%(prog)s',
+ add_help=True)
+
+        self.arg_input_group = self.parser.add_argument_group(
+            title="Input options", description="Input data. Required.")
+        self.arg_output_group = self.parser.add_argument_group(
+            title="Output options", description="Output path. Optional.")
+ self.arg_config_group = self.parser.add_argument_group(
+ title="Config options",
+ description=
+ "Run configuration for controlling module behavior, optional.")
+
+ self.add_module_config_arg()
+ self.add_module_input_arg()
+ self.add_module_output_arg()
+
+ args = self.parser.parse_args(argvs)
+
+ try:
+ input_data = self.check_input_data(args)
+        except (DataFormatError, RuntimeError):
+ self.parser.print_help()
+ return None
+
+ mkdir(args.output_path)
+        wavs, sample_rate = self.synthesize(
+            texts=input_data,
+            use_gpu=args.use_gpu,
+            speed=args.speed,
+            vocoder=args.vocoder)
+
+ for index, wav in enumerate(wavs):
+ sf.write(
+ os.path.join(args.output_path, f"{index}.wav"), wav,
+ sample_rate)
+
+ ret = f"The synthesized wav files have been saved in {args.output_path}"
+ return ret
+
+
+if __name__ == "__main__":
+
+ module = FastSpeech()
+ test_text = [
+ "Simple as this proposition is, it is necessary to be stated",
+ ]
+ wavs, sample_rate = module.synthesize(
+ texts=test_text, speed=1, vocoder="waveflow")
+ for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
diff --git a/hub_module/modules/audio/tts/transformer_tts_ljspeech/README.md b/hub_module/modules/audio/tts/transformer_tts_ljspeech/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..933e02748e2a6cd4cd23813a5a4a5ffbe3a98ca7
--- /dev/null
+++ b/hub_module/modules/audio/tts/transformer_tts_ljspeech/README.md
@@ -0,0 +1,118 @@
+## Overview
+
+TransformerTTS is an end-to-end speech synthesis model built on the Transformer architecture; it fuses Transformer with Tacotron2 and achieves satisfying results. Because the recurrent connections of RNNs are removed, the decoder inputs can be provided in parallel for parallel training, which greatly speeds up model training. transformer_tts_ljspeech is an English TTS model pretrained on the LJSpeech English speech dataset and supports prediction only.
+
+For more details, see the paper [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895).
+
+## Command-line Prediction
+
+```shell
+$ hub run transformer_tts_ljspeech --input_text="Life was like a box of chocolates, you never know what you're gonna get." --use_gpu True --vocoder griffin-lim
+```
+
+## API
+
+```python
+def synthesize(texts, use_gpu=False, vocoder="griffin-lim"):
+```
+
+Prediction API that synthesizes audio waveforms from the input texts.
+
+**Parameters**
+
+* texts (list\[str\]): texts to be synthesized;
+* use\_gpu (bool): whether to use the GPU; **if you use the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable first**;
+* vocoder (str): the vocoder to use, either "griffin-lim" or "waveflow".
+
+**Returns**
+
+* wavs (list): list of synthesis results; each element is the audio waveform for the corresponding input text and can be further processed or saved with `soundfile.write`.
+* sample\_rate (int): sample rate of the synthesized audio.
+
+**Code Example**
+
+```python
+import paddlehub as hub
+import soundfile as sf
+
+# Load transformer_tts_ljspeech module.
+module = hub.Module(name="transformer_tts_ljspeech")
+
+# Synthesize the texts into audio waveforms.
+test_texts = ["Life was like a box of chocolates, you never know what you're gonna get."]
+wavs, sample_rate = module.synthesize(texts=test_texts, use_gpu=True, vocoder="waveflow")
+for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
+```
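+
+Without a GPU, synthesis also runs on the CPU with the default Griffin-Lim vocoder; only the keyword arguments change:
+
+```python
+import paddlehub as hub
+import soundfile as sf
+
+module = hub.Module(name="transformer_tts_ljspeech")
+
+# CPU synthesis with the default Griffin-Lim vocoder.
+wavs, sample_rate = module.synthesize(
+    texts=["Life was like a box of chocolates, you never know what you're gonna get."])
+sf.write("sample.wav", wavs[0], sample_rate)
+```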
+
+## Serving Deployment
+
+PaddleHub Serving can deploy the module as an online service.
+
+### Step 1: Start PaddleHub Serving
+
+Run the startup command:
+```shell
+$ hub serving start -m transformer_tts_ljspeech
+```
+
+This deploys a service API; the default port is 8866.
+
+**NOTE:** To predict with the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable before starting the service; otherwise there is nothing to set.
+
+### Step 2: Send a Prediction Request
+
+With the server up, the following few lines of code send a prediction request and retrieve the result:
+
+```python
+import requests
+import json
+
+import soundfile as sf
+
+# Send the HTTP request.
+
+data = {'texts':['Simple as this proposition is, it is necessary to be stated',
+ 'Parakeet stands for Paddle PARAllel text-to-speech toolkit'],
+ 'use_gpu':False}
+headers = {"Content-type": "application/json"}
+url = "http://127.0.0.1:8866/predict/transformer_tts_ljspeech"
+r = requests.post(url=url, headers=headers, data=json.dumps(data))
+
+# Save the results.
+result = r.json()["results"]
+wavs = result["wavs"]
+sample_rate = result["sample_rate"]
+for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
+```
+
+## Source Code
+
+https://github.com/PaddlePaddle/Parakeet
+
+### Dependencies
+
+paddlepaddle >= 1.8.2
+
+paddlehub >= 1.7.0
+
+**NOTE:** Besides the Python dependencies, the libsndfile library must also be installed.
+On Ubuntu, run:
+```
+sudo apt-get install libsndfile1
+```
+On CentOS, run:
+```
+sudo yum install libsndfile
+```
+
+## Release History
+
+* 1.0.0
+
+  Initial release
diff --git a/hub_module/modules/audio/tts/transformer_tts_ljspeech/__init__.py b/hub_module/modules/audio/tts/transformer_tts_ljspeech/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/hub_module/modules/audio/tts/transformer_tts_ljspeech/module.py b/hub_module/modules/audio/tts/transformer_tts_ljspeech/module.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f58c083d6ba9b20cb434ff58110fbd0fb6e23b3
--- /dev/null
+++ b/hub_module/modules/audio/tts/transformer_tts_ljspeech/module.py
@@ -0,0 +1,329 @@
+# coding:utf-8
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import ast
+import argparse
+import importlib.util
+
+import nltk
+import numpy as np
+import paddle.fluid as fluid
+import paddle.fluid.dygraph as dg
+import paddlehub as hub
+from paddlehub.module.module import runnable
+from paddlehub.common.utils import mkdir
+from paddlehub.module.nlp_module import DataFormatError
+from paddlehub.common.logger import logger
+from paddlehub.module.module import moduleinfo, serving
+from paddlehub.common.dir import THIRD_PARTY_HOME
+from paddlehub.common.downloader import default_downloader
+
+lack_dependency = []
+for dependency in ["ruamel", "parakeet", "scipy", "soundfile", "librosa"]:
+ if not importlib.util.find_spec(dependency):
+ lack_dependency.append(dependency)
+
+# Accelerate NLTK package download via paddlehub. 'import parakeet' will use the package.
+_PUNKT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/punkt.tar.gz"
+_CMUDICT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/cmudict.tar.gz"
+nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
+tokenizers_path = os.path.join(nltk_path, "tokenizers")
+corpora_path = os.path.join(nltk_path, "corpora")
+punkt_path = os.path.join(tokenizers_path, "punkt")
+cmudict_path = os.path.join(corpora_path, "cmudict")
+
+if not os.path.exists(punkt_path):
+ default_downloader.download_file_and_uncompress(
+ url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
+if not os.path.exists(cmudict_path):
+ default_downloader.download_file_and_uncompress(
+ url=_CMUDICT_URL, save_path=corpora_path, print_progress=True)
+nltk.data.path.append(nltk_path)
+
+if not lack_dependency:
+ import soundfile as sf
+ import librosa
+ from ruamel import yaml
+ from scipy.io.wavfile import write
+ from parakeet.g2p.en import text_to_sequence
+ from parakeet.models.transformer_tts.utils import *
+ from parakeet.models.transformer_tts import TransformerTTS as TransformerTTSModel
+ from parakeet.models.waveflow import WaveFlowModule
+ from parakeet.utils import io
+ from parakeet.modules.weight_norm import WeightNormWrapper
+else:
+ raise ImportError(
+ "The module requires additional dependencies: %s. You can install parakeet via 'git clone https://github.com/PaddlePaddle/Parakeet && cd Parakeet && pip install -e .' and others via pip install"
+ % ", ".join(lack_dependency))
+
+
+class AttrDict(dict):
+ def __init__(self, *args, **kwargs):
+ super(AttrDict, self).__init__(*args, **kwargs)
+ self.__dict__ = self
+
+
+@moduleinfo(
+ name="transformer_tts_ljspeech",
+ version="1.0.0",
+ summary=
+ "Transformer TTS introduces and adapts the multi-head attention mechanism to replace the RNN structures and also the original attention mechanism in Tacotron2. See https://arxiv.org/abs/1809.08895 for details",
+ author="baidu-nlp",
+ author_email="",
+ type="nlp/tts",
+)
+class TransformerTTS(hub.NLPPredictionModule):
+ def _initialize(self):
+ """
+ initialize with the necessary elements
+ """
+ self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
+ "step-120000")
+ self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
+ "vocoder", "step-2000000")
+ self.waveflow_config_path = os.path.join(
+ self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")
+
+ tts_config_path = os.path.join(self.directory, "assets", "tts",
+ "ljspeech.yaml")
+ with open(tts_config_path) as f:
+ self.tts_config = yaml.load(f, Loader=yaml.Loader)
+
+        # The max number of decoder steps during synthesis.
+        self.max_len = 1000
+        # Stop-token threshold: once a step's stop prediction exceeds it, spectrum generation stops.
+ self.stop_threshold = 0.5
+
+ with fluid.dygraph.guard(fluid.CPUPlace()):
+ # Build TTS.
+ with fluid.unique_name.guard():
+ network_cfg = self.tts_config['network']
+ self.tts_model = TransformerTTSModel(
+ network_cfg['embedding_size'], network_cfg['hidden_size'],
+ network_cfg['encoder_num_head'],
+ network_cfg['encoder_n_layers'],
+ self.tts_config['audio']['num_mels'],
+ network_cfg['outputs_per_step'],
+ network_cfg['decoder_num_head'],
+ network_cfg['decoder_n_layers'])
+ io.load_parameters(
+ model=self.tts_model,
+ checkpoint_path=self.tts_checkpoint_path)
+
+ # Build vocoder.
+ args = AttrDict()
+ args.config = self.waveflow_config_path
+ args.use_fp16 = False
+ self.waveflow_config = io.add_yaml_config_to_args(args)
+ self.waveflow = WaveFlowModule(self.waveflow_config)
+ io.load_parameters(
+ model=self.waveflow,
+ checkpoint_path=self.waveflow_checkpoint_path)
+
+ def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
+ """
+ Get the synthetic wavs from the texts.
+
+ Args:
+ texts(list): the input texts to be predicted.
+            use_gpu(bool): whether to use the GPU for prediction.
+            vocoder(str): the vocoder name, "griffin-lim" or "waveflow".
+
+        Returns:
+            wavs(list): the synthesized audio waveforms; use soundfile.write to save them.
+ sample_rate(int): the audio sample rate.
+ """
+ if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
+ use_gpu = False
+ logger.warning(
+ "use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
+ )
+ if use_gpu:
+ place = fluid.CUDAPlace(0)
+ else:
+ place = fluid.CPUPlace()
+
+ if texts and isinstance(texts, list):
+ predicted_data = texts
+ else:
+ raise ValueError(
+ "The input data is inconsistent with expectations.")
+
+ wavs = []
+ with fluid.dygraph.guard(place):
+ self.tts_model.eval()
+ self.waveflow.eval()
+ for text in predicted_data:
+ # init input
+ logger.info("Processing sentence: %s" % text)
+ text = np.asarray(text_to_sequence(text))
+ text = fluid.layers.unsqueeze(
+ dg.to_variable(text).astype(np.int64), [0])
+ mel_input = dg.to_variable(np.zeros([1, 1,
+ 80])).astype(np.float32)
+ pos_text = np.arange(1, text.shape[1] + 1)
+ pos_text = fluid.layers.unsqueeze(
+ dg.to_variable(pos_text).astype(np.int64), [0])
+
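+                # Autoregressive decoding: append the newest predicted mel frames
+                # each step and stop once the stop token crosses the threshold.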
+ for i in range(self.max_len):
+ pos_mel = np.arange(1, mel_input.shape[1] + 1)
+ pos_mel = fluid.layers.unsqueeze(
+ dg.to_variable(pos_mel).astype(np.int64), [0])
+ mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = self.tts_model(
+ text, mel_input, pos_text, pos_mel)
+ if stop_preds.numpy()[0, -1] > self.stop_threshold:
+ break
+ mel_input = fluid.layers.concat(
+ [mel_input, postnet_pred[:, -1:, :]], axis=1)
+ if vocoder == 'griffin-lim':
+ # synthesis use griffin-lim
+ wav = self.synthesis_with_griffinlim(
+ postnet_pred, self.tts_config['audio'])
+ elif vocoder == 'waveflow':
+ # synthesis use waveflow
+ wav = self.synthesis_with_waveflow(
+ postnet_pred, self.waveflow_config.sigma)
+ else:
+ raise ValueError(
+                        'Unsupported vocoder: only "griffin-lim" and "waveflow" are supported, but received %s.'
+ % vocoder)
+ wavs.append(wav)
+ return wavs, self.tts_config['audio']['sr']
+
+ def synthesis_with_griffinlim(self, mel_output, cfg):
+ # synthesis with griffin-lim
+ mel_output = fluid.layers.transpose(
+ fluid.layers.squeeze(mel_output, [0]), [1, 0])
+ mel_output = np.exp(mel_output.numpy())
+ basis = librosa.filters.mel(
+ cfg['sr'],
+ cfg['n_fft'],
+ cfg['num_mels'],
+ fmin=cfg['fmin'],
+ fmax=cfg['fmax'])
+ inv_basis = np.linalg.pinv(basis)
+ spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
+
+ wav = librosa.core.griffinlim(
+ spec**cfg['power'],
+ hop_length=cfg['hop_length'],
+ win_length=cfg['win_length'])
+
+ return wav
+
+ def synthesis_with_waveflow(self, mel_output, sigma):
+ mel_spectrogram = fluid.layers.transpose(
+ fluid.layers.squeeze(mel_output, [0]), [1, 0])
+ mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])
+
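+        # Fuse weight-normalized parameters so inference runs on the plain weights.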
+ for layer in self.waveflow.sublayers():
+ if isinstance(layer, WeightNormWrapper):
+ layer.remove_weight_norm()
+
+ # Run model inference.
+ wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
+ return wav.numpy()[0]
+
+ @serving
+ def serving_method(self, texts, use_gpu=False, vocoder="griffin-lim"):
+ """
+ Run as a service.
+ """
+ wavs, sample_rate = self.synthesize(texts, use_gpu, vocoder)
+ wavs = [wav.tolist() for wav in wavs]
+ result = {"wavs": wavs, "sample_rate": sample_rate}
+ return result
+
+ def add_module_config_arg(self):
+ """
+ Add the command config options
+ """
+ self.arg_config_group.add_argument(
+ '--use_gpu',
+ type=ast.literal_eval,
+ default=False,
+ help="whether use GPU for prediction")
+
+ self.arg_config_group.add_argument(
+ '--vocoder',
+ type=str,
+ default="griffin-lim",
+ choices=['griffin-lim', 'waveflow'],
+ help="the vocoder name")
+
+ def add_module_output_arg(self):
+ """
+ Add the command config options
+ """
+ self.arg_config_group.add_argument(
+ '--output_path',
+ type=str,
+ default=os.path.abspath(
+ os.path.join(os.path.curdir, f"{self.name}_prediction")),
+ help="path to save experiment results")
+
+ @runnable
+ def run_cmd(self, argvs):
+ """
+ Run as a command
+ """
+ self.parser = argparse.ArgumentParser(
+ description='Run the %s module.' % self.name,
+ prog='hub run %s' % self.name,
+ usage='%(prog)s',
+ add_help=True)
+
+        self.arg_input_group = self.parser.add_argument_group(
+            title="Input options", description="Input data. Required.")
+        self.arg_output_group = self.parser.add_argument_group(
+            title="Output options", description="Output path. Optional.")
+ self.arg_config_group = self.parser.add_argument_group(
+ title="Config options",
+ description=
+ "Run configuration for controlling module behavior, optional.")
+
+ self.add_module_config_arg()
+ self.add_module_input_arg()
+ self.add_module_output_arg()
+
+ args = self.parser.parse_args(argvs)
+
+ try:
+ input_data = self.check_input_data(args)
+        except (DataFormatError, RuntimeError):
+ self.parser.print_help()
+ return None
+
+ mkdir(args.output_path)
+ wavs, sample_rate = self.synthesize(
+ texts=input_data, use_gpu=args.use_gpu, vocoder=args.vocoder)
+
+ for index, wav in enumerate(wavs):
+ sf.write(
+ os.path.join(args.output_path, f"{index}.wav"), wav,
+ sample_rate)
+
+ ret = f"The synthesized wav files have been saved in {args.output_path}"
+ return ret
+
+
+if __name__ == "__main__":
+
+ module = TransformerTTS()
+ test_text = [
+ "Life was like a box of chocolates, you never know what you're gonna get.",
+ ]
+ wavs, sample_rate = module.synthesize(texts=test_text, vocoder="waveflow")
+ for index, wav in enumerate(wavs):
+ sf.write(f"{index}.wav", wav, sample_rate)
diff --git a/hub_module/scripts/configs/deep_voice3.yml b/hub_module/scripts/configs/deep_voice3.yml
new file mode 100644
index 0000000000000000000000000000000000000000..4acdb7133bf022031e2bafe2cfd10d8bb0a47a08
--- /dev/null
+++ b/hub_module/scripts/configs/deep_voice3.yml
@@ -0,0 +1,9 @@
+name: deepvoice3_ljspeech
+dir: "modules/audio/tts/deepvoice3_ljspeech"
+exclude:
+ - README.md
+resources:
+ -
+ url: https://paddlespeech.bj.bcebos.com/Parakeet/deepvoice3_ljspeech_griffin-lim_ckpt_1.0.zip
+ dest: assets
+ uncompress: True
diff --git a/paddlehub/module/nlp_module.py b/paddlehub/module/nlp_module.py
index 435f1a9294e02bda4ea5dd959cade5a9706cf901..b696d52e90c3335b87dfa1023efdcac62187d30d 100644
--- a/paddlehub/module/nlp_module.py
+++ b/paddlehub/module/nlp_module.py
@@ -19,7 +19,6 @@ from __future__ import print_function
import argparse
import ast
-import json
import os
import re
import six
@@ -31,7 +30,6 @@ import paddle.fluid as fluid
import paddlehub as hub
from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor
from paddlehub.common import paddle_helper, tmp_dir
-from paddlehub.common.logger import logger
from paddlehub.common.utils import sys_stdin_encoding, version_compare
from paddlehub.io.parser import txt_parser
from paddlehub.module.module import runnable
diff --git a/requirements.txt b/requirements.txt
index 37372665c93089f01b9331dd9e1bd9ff6b9f116b..d430c04e9c85fd24becadc468d81601506b5588e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ cma >= 2.7.0
sentencepiece
colorlog
tqdm
+nltk
# pandas no longer support python2 in version 0.25 and above
pandas ; python_version >= "3"