Unverified commit 03a3c2ff authored by kinghuin, committed by GitHub

Add Parakeet tts (#711)

Parent 76c52fd1
## Overview
Deep Voice 3 is an end-to-end TTS model released by Baidu Research in 2017 (the paper was accepted at ICLR 2018). It is a seq2seq model built on convolutional networks and an attention mechanism; because it contains no recurrent networks, it can be trained in parallel and is much faster than RNN-based models. Deep Voice 3 can learn the characteristics of multiple speakers and can be combined with several vocoders. deepvoice3_ljspeech is an English TTS model pre-trained on the LJSpeech English speech dataset; it supports prediction only.
<p align="center">
<img src="https://github.com/PaddlePaddle/Parakeet/blob/develop/examples/deepvoice3/images/model_architecture.png" hspace='10'/> <br />
</p>
For more details, see the paper [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
## Command-Line Prediction
```shell
$ hub run deepvoice3_ljspeech --input_text='Simple as this proposition is, it is necessary to be stated'
```
## API
```python
def synthesize(texts, use_gpu=False, vocoder="griffin-lim"):
```
Prediction API that synthesizes the corresponding audio waveform for each input text.
**Parameters**
* texts (list\[str\]): texts to be synthesized;
* use\_gpu (bool): whether to use the GPU; **if you use the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable first**;
* vocoder (str): the vocoder to use, either "griffin-lim" or "waveflow".
**Returns**
* wavs (list): list of synthesis results; each element is the audio waveform for the corresponding input text and can be further processed or saved with `soundfile.write`.
* sample\_rate (int): sample rate of the synthesized audio.
**Code Example**
```python
import paddlehub as hub
import soundfile as sf
# Load deepvoice3_ljspeech module.
module = hub.Module(name="deepvoice3_ljspeech")
# Synthesize speech for the given texts.
test_texts = ['Simple as this proposition is, it is necessary to be stated',
'Parakeet stands for Paddle PARAllel text-to-speech toolkit']
wavs, sample_rate = module.synthesize(texts=test_texts)
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
```
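To run on the GPU with the WaveFlow vocoder, a minimal sketch is shown below (it assumes GPU 0 is visible; `synthesize` falls back to the CPU when CUDA\_VISIBLE\_DEVICES is not set):
```python
import os
# CUDA_VISIBLE_DEVICES must be set before use_gpu=True takes effect (GPU 0 assumed here).
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import paddlehub as hub
import soundfile as sf

module = hub.Module(name="deepvoice3_ljspeech")
# Synthesize with the WaveFlow vocoder instead of the default Griffin-Lim.
wavs, sample_rate = module.synthesize(
    texts=['Simple as this proposition is, it is necessary to be stated'],
    use_gpu=True,
    vocoder="waveflow")
for index, wav in enumerate(wavs):
    sf.write(f"gpu_{index}.wav", wav, sample_rate)
```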
## Serving Deployment
PaddleHub Serving can deploy the model as an online service.
### Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m deepvoice3_ljspeech
```
This deploys a serving API; the default port is 8866.
**NOTE:** To predict on the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable before starting the service; otherwise it does not need to be set.
### Step 2: Send a Prediction Request
With the server configured, the following few lines of code send a prediction request and retrieve the result.
```python
import requests
import json
import soundfile as sf
# Send the HTTP request
data = {'texts':['Simple as this proposition is, it is necessary to be stated',
'Parakeet stands for Paddle PARAllel text-to-speech toolkit'],
'use_gpu':False}
headers = {"Content-type": "application/json"}
url = "http://127.0.0.1:8866/predict/deepvoice3_ljspeech"
r = requests.post(url=url, headers=headers, data=json.dumps(data))
# Save the results
result = r.json()["results"]
wavs = result["wavs"]
sample_rate = result["sample_rate"]
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
```
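The request body may also include the optional `vocoder` field accepted by the serving API; a sketch of such a payload (other fields as above):
```python
# Optional: choose the vocoder per request; "griffin-lim" is the default.
data = {'texts': ['Simple as this proposition is, it is necessary to be stated'],
        'use_gpu': False,
        'vocoder': 'waveflow'}
```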
## Source Code
https://github.com/PaddlePaddle/Parakeet
### Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.7.0
**NOTE:** In addition to the Python dependencies, the libsndfile library must be installed.
On Ubuntu, run:
```
sudo apt-get install libsndfile1
```
On CentOS, run:
```
sudo yum install libsndfile
```
## Release History
* 1.0.0
  Initial release
# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ast
import importlib.util
import nltk
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddlehub as hub
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, serving
from paddlehub.common.dir import THIRD_PARTY_HOME
from paddlehub.common.utils import mkdir
from paddlehub.common.downloader import default_downloader
from paddlehub.module.module import runnable
from paddlehub.module.nlp_module import DataFormatError
lack_dependency = []
for dependency in ["ruamel", "parakeet", "soundfile", "librosa"]:
if not importlib.util.find_spec(dependency):
lack_dependency.append(dependency)
# Accelerate NLTK package download via paddlehub. 'import parakeet' will use the package.
_PUNKT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/punkt.tar.gz"
_CMUDICT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/cmudict.tar.gz"
nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
tokenizers_path = os.path.join(nltk_path, "tokenizers")
corpora_path = os.path.join(nltk_path, "corpora")
punkt_path = os.path.join(tokenizers_path, "punkt")
cmudict_path = os.path.join(corpora_path, "cmudict")
if not os.path.exists(punkt_path):
default_downloader.download_file_and_uncompress(
url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
if not os.path.exists(cmudict_path):
default_downloader.download_file_and_uncompress(
url=_CMUDICT_URL, save_path=corpora_path, print_progress=True)
nltk.data.path.append(nltk_path)
if not lack_dependency:
import soundfile as sf
import librosa
import ruamel.yaml
from parakeet.utils import io
from parakeet.g2p import en
from parakeet.models.deepvoice3 import Encoder, Decoder, PostNet, SpectraNet
from parakeet.models.waveflow import WaveFlowModule
else:
raise ImportError(
"The module requires additional dependencies: %s. You can install parakeet via 'git clone https://github.com/PaddlePaddle/Parakeet && cd Parakeet && pip install -e .' and others via pip install"
% ", ".join(lack_dependency))
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
class WaveflowVocoder(object):
def __init__(self, config_path, checkpoint_path):
with open(config_path, 'rt') as f:
config = ruamel.yaml.safe_load(f)
ns = argparse.Namespace()
for k, v in config.items():
setattr(ns, k, v)
ns.use_fp16 = False
self.model = WaveFlowModule(ns)
io.load_parameters(self.model, checkpoint_path=checkpoint_path)
def __call__(self, mel):
with dg.no_grad():
self.model.eval()
audio = self.model.synthesize(mel)
self.model.train()
return audio
class GriffinLimVocoder(object):
def __init__(self,
sharpening_factor=1.4,
sample_rate=22050,
n_fft=1024,
win_length=1024,
hop_length=256):
self.sample_rate = sample_rate
self.n_fft = n_fft
self.sharpening_factor = sharpening_factor
self.win_length = win_length
self.hop_length = hop_length
def __call__(self, mel):
spec = librosa.feature.inverse.mel_to_stft(
np.exp(mel),
sr=self.sample_rate,
n_fft=self.n_fft,
fmin=0,
fmax=8000.0,
power=1.0)
audio = librosa.core.griffinlim(
spec**self.sharpening_factor,
win_length=self.win_length,
hop_length=self.hop_length)
return audio
@moduleinfo(
name="deepvoice3_ljspeech",
version="1.0.0",
summary=
"Deep Voice 3, a fully-convolutional attention-based neural text-to-speech (TTS) system.",
author="paddlepaddle",
author_email="",
type="nlp/tts",
)
class DeepVoice3(hub.NLPPredictionModule):
def _initialize(self):
"""
initialize with the necessary elements
"""
self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
"step-1780000")
self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
"vocoder", "step-2000000")
self.waveflow_config_path = os.path.join(
self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")
tts_config_path = os.path.join(self.directory, "assets", "tts",
"ljspeech.yaml")
with open(tts_config_path) as f:
self.tts_config = ruamel.yaml.safe_load(f)
with fluid.dygraph.guard(fluid.CPUPlace()):
char_embedding = dg.Embedding((en.n_vocab,
self.tts_config["char_dim"]))
multi_speaker = self.tts_config["n_speakers"] > 1
speaker_embedding = dg.Embedding((self.tts_config["n_speakers"], self.tts_config["speaker_dim"])) \
if multi_speaker else None
encoder = Encoder(
self.tts_config["encoder_layers"],
self.tts_config["char_dim"],
self.tts_config["encoder_dim"],
self.tts_config["kernel_size"],
has_bias=multi_speaker,
bias_dim=self.tts_config["speaker_dim"],
keep_prob=1.0 - self.tts_config["dropout"])
decoder = Decoder(
self.tts_config["n_mels"],
self.tts_config["reduction_factor"],
list(self.tts_config["prenet_sizes"]) +
[self.tts_config["char_dim"]],
self.tts_config["decoder_layers"],
self.tts_config["kernel_size"],
self.tts_config["attention_dim"],
position_encoding_weight=self.tts_config["position_weight"],
omega=self.tts_config["position_rate"],
has_bias=multi_speaker,
bias_dim=self.tts_config["speaker_dim"],
keep_prob=1.0 - self.tts_config["dropout"])
postnet = PostNet(
self.tts_config["postnet_layers"],
self.tts_config["char_dim"],
self.tts_config["postnet_dim"],
self.tts_config["kernel_size"],
self.tts_config["n_mels"],
self.tts_config["reduction_factor"],
has_bias=multi_speaker,
bias_dim=self.tts_config["speaker_dim"],
keep_prob=1.0 - self.tts_config["dropout"])
self.tts_model = SpectraNet(char_embedding, speaker_embedding,
encoder, decoder, postnet)
io.load_parameters(
model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)
self.waveflow = WaveflowVocoder(
config_path=self.waveflow_config_path,
checkpoint_path=self.waveflow_checkpoint_path)
self.griffin = GriffinLimVocoder(
sharpening_factor=self.tts_config["sharpening_factor"],
sample_rate=self.tts_config["sample_rate"],
n_fft=self.tts_config["n_fft"],
win_length=self.tts_config["win_length"],
hop_length=self.tts_config["hop_length"])
def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
"""
Get the synthetic wavs from the texts.
Args:
texts(list): the input texts to be predicted.
use_gpu(bool): whether to use gpu for prediction.
vocoder(str): the vocoder name, "griffin-lim" or "waveflow".
Returns:
wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
sample_rate(int): the audio sample rate.
"""
if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
use_gpu = False
logger.warning(
"use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
)
place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
if texts and isinstance(texts, list):
predicted_data = texts
else:
raise ValueError(
"The input data is inconsistent with expectations.")
wavs = []
with fluid.dygraph.guard(place):
self.tts_model.eval()
self.waveflow.model.eval()
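# Force monotonic attention only on the decoder layers listed in monotonic_layers during inference to keep the text-to-audio alignment stable.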
monotonic_layers = [4]
for text in predicted_data:
# init input
logger.info("Processing sentence: %s" % text)
text = en.text_to_sequence(text, p=1.0)
text = np.expand_dims(np.array(text, dtype="int64"), 0)
lengths = np.array([text.size], dtype=np.int64)
text_seqs = dg.to_variable(text)
text_lengths = dg.to_variable(lengths)
decoder_layers = self.tts_config["decoder_layers"]
force_monotonic_attention = [False] * decoder_layers
for i in monotonic_layers:
force_monotonic_attention[i] = True
outputs = self.tts_model(
text_seqs,
text_lengths,
speakers=None,
force_monotonic_attention=force_monotonic_attention,
window=(self.tts_config["backward_step"],
self.tts_config["forward_step"]))
decoded, refined, attentions = outputs
if vocoder == 'griffin-lim':
# synthesis use griffin-lim
wav = self.griffin(refined.numpy()[0].T)
elif vocoder == 'waveflow':
# synthesis use waveflow
wav = self.waveflow(
fluid.layers.transpose(refined, [0, 2, 1])).numpy()[0]
else:
raise ValueError(
'vocoder error: only griffin-lim and waveflow are supported, but received %s.'
% vocoder)
wavs.append(wav)
return wavs, self.tts_config["sample_rate"]
@serving
def serving_method(self, texts, use_gpu=False, vocoder="griffin-lim"):
"""
Run as a service.
"""
wavs, sample_rate = self.synthesize(texts, use_gpu, vocoder)
wavs = [wav.tolist() for wav in wavs]
result = {"wavs": wavs, "sample_rate": sample_rate}
return result
def add_module_config_arg(self):
"""
Add the command config options
"""
self.arg_config_group.add_argument(
'--use_gpu',
type=ast.literal_eval,
default=False,
help="whether use GPU for prediction")
self.arg_config_group.add_argument(
'--vocoder',
type=str,
default="griffin-lim",
choices=['griffin-lim', 'waveflow'],
help="the vocoder name")
def add_module_output_arg(self):
"""
Add the command config options
"""
self.arg_config_group.add_argument(
'--output_path',
type=str,
default=os.path.abspath(
os.path.join(os.path.curdir, f"{self.name}_prediction")),
help="path to save experiment results")
@runnable
def run_cmd(self, argvs):
"""
Run as a command
"""
self.parser = argparse.ArgumentParser(
description='Run the %s module.' % self.name,
prog='hub run %s' % self.name,
usage='%(prog)s',
add_help=True)
self.arg_input_group = self.parser.add_argument_group(
title="Input options", description="Input data. Required")
self.arg_input_group = self.parser.add_argument_group(
title="Ouput options", description="Ouput path. Optional.")
self.arg_config_group = self.parser.add_argument_group(
title="Config options",
description=
"Run configuration for controlling module behavior, optional.")
self.add_module_config_arg()
self.add_module_input_arg()
self.add_module_output_arg()
args = self.parser.parse_args(argvs)
try:
input_data = self.check_input_data(args)
except (DataFormatError, RuntimeError):
self.parser.print_help()
return None
mkdir(args.output_path)
wavs, sample_rate = self.synthesize(
texts=input_data, use_gpu=args.use_gpu, vocoder=args.vocoder)
for index, wav in enumerate(wavs):
sf.write(
os.path.join(args.output_path, f"{index}.wav"), wav,
sample_rate)
ret = f"The synthesized wav files have been saved in {args.output_path}"
return ret
if __name__ == "__main__":
module = DeepVoice3()
test_text = [
"Simple as this proposition is, it is necessary to be stated",
"Parakeet stands for Paddle PARAllel text-to-speech toolkit.",
]
wavs, sample_rate = module.synthesize(texts=test_text, vocoder="waveflow")
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
## Overview
FastSpeech is a Transformer-based feed-forward network. The authors extract the attention alignments from an encoder-decoder teacher model to predict phoneme durations, and a length regulator expands the text sequence to match the length of the target mel-spectrogram so that mel-spectrograms can be generated in parallel. The model largely eliminates word skipping and repetition in difficult cases, allows smooth adjustment of the speech speed, and, more importantly, greatly speeds up mel-spectrogram generation. fastspeech_ljspeech is an English TTS model pre-trained on the LJSpeech English speech dataset; it supports prediction only.
<p align="center">
<img src="https://github.com/PaddlePaddle/Parakeet/blob/develop/examples/fastspeech/images/model_architecture.png" hspace='10'/> <br />
</p>
For more details, see the paper [FastSpeech: Fast, Robust and Controllable Text to Speech](https://arxiv.org/abs/1905.09263).
## Command-Line Prediction
```shell
$ hub run fastspeech_ljspeech --input_text='Simple as this proposition is, it is necessary to be stated'
```
## API
```python
def synthesize(texts, use_gpu=False, speed=1.0, vocoder="griffin-lim"):
```
Prediction API that synthesizes the corresponding audio waveform for each input text.
**Parameters**
* texts (list\[str\]): texts to be synthesized;
* use\_gpu (bool): whether to use the GPU; **if you use the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable first**;
* speed (float): speech speed; 1.0 means the original speed.
* vocoder (str): the vocoder to use, either "griffin-lim" or "waveflow".
**Returns**
* wavs (list): list of synthesis results; each element is the audio waveform for the corresponding input text and can be further processed or saved with `soundfile.write`.
* sample\_rate (int): sample rate of the synthesized audio.
**Code Example**
```python
import paddlehub as hub
import soundfile as sf
# Load fastspeech_ljspeech module.
module = hub.Module(name="fastspeech_ljspeech")
# Synthesize speech for the given texts.
test_texts = ['Simple as this proposition is, it is necessary to be stated',
'Parakeet stands for Paddle PARAllel text-to-speech toolkit']
wavs, sample_rate = module.synthesize(texts=test_texts)
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
```
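Continuing the example above, the `speed` argument can slow down or speed up the output; the values below are only illustrative:
```python
# Slower (0.8x) and faster (1.25x) renditions of the same sentences.
slow_wavs, sr = module.synthesize(texts=test_texts, speed=0.8)
fast_wavs, sr = module.synthesize(texts=test_texts, speed=1.25)
sf.write("slow_0.wav", slow_wavs[0], sr)
sf.write("fast_0.wav", fast_wavs[0], sr)
```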
## Serving Deployment
PaddleHub Serving can deploy the model as an online service.
### Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m fastspeech_ljspeech
```
This deploys a serving API; the default port is 8866.
**NOTE:** To predict on the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable before starting the service; otherwise it does not need to be set.
### Step 2: Send a Prediction Request
With the server configured, the following few lines of code send a prediction request and retrieve the result.
```python
import requests
import json
import soundfile as sf
# Send the HTTP request
data = {'texts':['Simple as this proposition is, it is necessary to be stated',
'Parakeet stands for Paddle PARAllel text-to-speech toolkit'],
'use_gpu':False}
headers = {"Content-type": "application/json"}
url = "http://127.0.0.1:8866/predict/fastspeech_ljspeech"
r = requests.post(url=url, headers=headers, data=json.dumps(data))
# Save the results
result = r.json()["results"]
wavs = result["wavs"]
sample_rate = result["sample_rate"]
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
```
## Source Code
https://github.com/PaddlePaddle/Parakeet
### Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.7.0
**NOTE:** In addition to the Python dependencies, the libsndfile library must be installed.
On Ubuntu, run:
```
sudo apt-get install libsndfile1
```
On CentOS, run:
```
sudo yum install libsndfile
```
## Release History
* 1.0.0
  Initial release
# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import ast
import argparse
import importlib.util
import nltk
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddlehub as hub
from paddlehub.module.module import runnable
from paddlehub.common.utils import mkdir
from paddlehub.module.nlp_module import DataFormatError
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, serving
from paddlehub.common.dir import THIRD_PARTY_HOME
from paddlehub.common.downloader import default_downloader
lack_dependency = []
for dependency in ["ruamel", "parakeet", "soundfile", "librosa"]:
if not importlib.util.find_spec(dependency):
lack_dependency.append(dependency)
# Accelerate NLTK package download via paddlehub. 'import parakeet' will use the package.
_PUNKT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/punkt.tar.gz"
_CMUDICT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/cmudict.tar.gz"
nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
tokenizers_path = os.path.join(nltk_path, "tokenizers")
corpora_path = os.path.join(nltk_path, "corpora")
punkt_path = os.path.join(tokenizers_path, "punkt")
cmudict_path = os.path.join(corpora_path, "cmudict")
if not os.path.exists(punkt_path):
default_downloader.download_file_and_uncompress(
url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
if not os.path.exists(cmudict_path):
default_downloader.download_file_and_uncompress(
url=_CMUDICT_URL, save_path=corpora_path, print_progress=True)
nltk.data.path.append(nltk_path)
if not lack_dependency:
import soundfile as sf
import librosa
from ruamel import yaml
from parakeet.models.fastspeech.fastspeech import FastSpeech as FastSpeechModel
from parakeet.g2p.en import text_to_sequence
from parakeet.models.transformer_tts.utils import *
from parakeet.utils import io
from parakeet.modules.weight_norm import WeightNormWrapper
from parakeet.models.waveflow import WaveFlowModule
else:
raise ImportError(
"The module requires additional dependencies: %s. You can install parakeet via 'git clone https://github.com/PaddlePaddle/Parakeet && cd Parakeet && pip install -e .' and others via pip install"
% ", ".join(lack_dependency))
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
@moduleinfo(
name="fastspeech_ljspeech",
version="1.0.0",
summary=
"FastSpeech proposes a novel feed-forward network based on Transformer to generate mel-spectrogram in parallel for TTS. See https://arxiv.org/abs/1905.09263 for details.",
author="baidu-nlp",
author_email="",
type="nlp/tts",
)
class FastSpeech(hub.NLPPredictionModule):
def _initialize(self):
"""
initialize with the necessary elements
"""
self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
"step-162000")
self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
"vocoder", "step-2000000")
self.waveflow_config_path = os.path.join(
self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")
tts_config_path = os.path.join(self.directory, "assets", "tts",
"ljspeech.yaml")
with open(tts_config_path) as f:
self.tts_config = yaml.load(f, Loader=yaml.Loader)
with fluid.dygraph.guard(fluid.CPUPlace()):
self.tts_model = FastSpeechModel(
self.tts_config['network'],
num_mels=self.tts_config['audio']['num_mels'])
io.load_parameters(
model=self.tts_model, checkpoint_path=self.tts_checkpoint_path)
# Build vocoder.
args = AttrDict()
args.config = self.waveflow_config_path
args.use_fp16 = False
self.waveflow_config = io.add_yaml_config_to_args(args)
self.waveflow = WaveFlowModule(self.waveflow_config)
io.load_parameters(
model=self.waveflow,
checkpoint_path=self.waveflow_checkpoint_path)
def synthesize(self, texts, use_gpu=False, speed=1.0,
vocoder="griffin-lim"):
"""
Get the synthetic wavs from the texts.
Args:
texts(list): the input texts to be predicted.
use_gpu(bool): whether to use gpu for prediction. Default False.
speed(float): controls the voice speed. Default 1.0.
vocoder(str): the vocoder name, "griffin-lim" or "waveflow".
Returns:
wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
sample_rate(int): the audio sample rate.
"""
if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
use_gpu = False
logger.warning(
"use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
)
if use_gpu:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
if texts and isinstance(texts, list):
predicted_data = texts
else:
raise ValueError(
"The input data is inconsistent with expectations.")
wavs = []
with fluid.dygraph.guard(place):
self.tts_model.eval()
self.waveflow.eval()
for text in predicted_data:
# init input
logger.info("Processing sentence: %s" % text)
text = np.asarray(text_to_sequence(text))
text = np.expand_dims(text, axis=0)
pos_text = np.arange(1, text.shape[1] + 1)
pos_text = np.expand_dims(pos_text, axis=0)
text = dg.to_variable(text).astype(np.int64)
pos_text = dg.to_variable(pos_text).astype(np.int64)
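# alpha is the length-regulator scale factor; alpha = 1 / speed, so speed > 1.0 yields shorter (faster) audio.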
_, mel_output_postnet = self.tts_model(
text, pos_text, alpha=1 / speed)
if vocoder == 'griffin-lim':
# synthesis use griffin-lim
wav = self.synthesis_with_griffinlim(
mel_output_postnet, self.tts_config['audio'])
elif vocoder == 'waveflow':
wav = self.synthesis_with_waveflow(
mel_output_postnet, self.waveflow_config.sigma)
else:
raise ValueError(
'vocoder error: only griffin-lim and waveflow are supported, but received %s.'
% vocoder)
wavs.append(wav)
return wavs, self.tts_config['audio']['sr']
def synthesis_with_griffinlim(self, mel_output, cfg):
# synthesis with griffin-lim
mel_output = fluid.layers.transpose(
fluid.layers.squeeze(mel_output, [0]), [1, 0])
mel_output = np.exp(mel_output.numpy())
basis = librosa.filters.mel(
cfg['sr'],
cfg['n_fft'],
cfg['num_mels'],
fmin=cfg['fmin'],
fmax=cfg['fmax'])
inv_basis = np.linalg.pinv(basis)
spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
wav = librosa.core.griffinlim(
spec**cfg['power'],
hop_length=cfg['hop_length'],
win_length=cfg['win_length'])
return wav
def synthesis_with_waveflow(self, mel_output, sigma):
mel_spectrogram = fluid.layers.transpose(
fluid.layers.squeeze(mel_output, [0]), [1, 0])
mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])
for layer in self.waveflow.sublayers():
if isinstance(layer, WeightNormWrapper):
layer.remove_weight_norm()
# Run model inference.
wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
return wav.numpy()[0]
@serving
def serving_method(self,
texts,
use_gpu=False,
speed=1.0,
vocoder="griffin-lim"):
"""
Run as a service.
"""
wavs, sample_rate = self.synthesize(texts, use_gpu, speed, vocoder)
wavs = [wav.tolist() for wav in wavs]
result = {"wavs": wavs, "sample_rate": sample_rate}
return result
def add_module_config_arg(self):
"""
Add the command config options
"""
self.arg_config_group.add_argument(
'--use_gpu',
type=ast.literal_eval,
default=False,
help="whether use GPU for prediction")
self.arg_config_group.add_argument(
'--vocoder',
type=str,
default="griffin-lim",
choices=['griffin-lim', 'waveflow'],
help="the vocoder name")
def add_module_output_arg(self):
"""
Add the command config options
"""
self.arg_config_group.add_argument(
'--output_path',
type=str,
default=os.path.abspath(
os.path.join(os.path.curdir, f"{self.name}_prediction")),
help="path to save experiment results")
@runnable
def run_cmd(self, argvs):
"""
Run as a command
"""
self.parser = argparse.ArgumentParser(
description='Run the %s module.' % self.name,
prog='hub run %s' % self.name,
usage='%(prog)s',
add_help=True)
self.arg_input_group = self.parser.add_argument_group(
title="Input options", description="Input data. Required")
self.arg_input_group = self.parser.add_argument_group(
title="Ouput options", description="Ouput path. Optional.")
self.arg_config_group = self.parser.add_argument_group(
title="Config options",
description=
"Run configuration for controlling module behavior, optional.")
self.add_module_config_arg()
self.add_module_input_arg()
self.add_module_output_arg()
args = self.parser.parse_args(argvs)
try:
input_data = self.check_input_data(args)
except (DataFormatError, RuntimeError):
self.parser.print_help()
return None
mkdir(args.output_path)
wavs, sample_rate = self.synthesize(
texts=input_data, use_gpu=args.use_gpu, vocoder=args.vocoder)
for index, wav in enumerate(wavs):
sf.write(
os.path.join(args.output_path, f"{index}.wav"), wav,
sample_rate)
ret = f"The synthesized wav files have been saved in {args.output_path}"
return ret
if __name__ == "__main__":
module = FastSpeech()
test_text = [
"Simple as this proposition is, it is necessary to be stated",
]
wavs, sample_rate = module.synthesize(
texts=test_text, speed=1, vocoder="waveflow")
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
## Overview
TransformerTTS is an end-to-end speech synthesis model built on the Transformer architecture. It combines Transformer with Tacotron 2 and achieves satisfactory results. Because the recurrent connections of RNNs are removed, the decoder inputs can be provided in parallel for parallel training, which greatly speeds up model training. transformer_tts_ljspeech is an English TTS model pre-trained on the LJSpeech English speech dataset; it supports prediction only.
<p align="center">
<img src="https://github.com/PaddlePaddle/Parakeet/blob/develop/examples/transformer_tts/images/model_architecture.jpg" hspace='10'/> <br />
</p>
For more details, see the paper [Neural Speech Synthesis with Transformer Network](https://arxiv.org/abs/1809.08895).
## Command-Line Prediction
```shell
$ hub run transformer_tts_ljspeech --input_text="Life was like a box of chocolates, you never know what you're gonna get." --use_gpu True --vocoder griffin-lim
```
## API
```python
def synthesize(texts, use_gpu=False, vocoder="griffin-lim"):
```
Prediction API that synthesizes the corresponding audio waveform for each input text.
**Parameters**
* texts (list\[str\]): texts to be synthesized;
* use\_gpu (bool): whether to use the GPU; **if you use the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable first**;
* vocoder (str): the vocoder to use, either "griffin-lim" or "waveflow".
**Returns**
* wavs (list): list of synthesis results; each element is the audio waveform for the corresponding input text and can be further processed or saved with `soundfile.write`.
* sample\_rate (int): sample rate of the synthesized audio.
**Code Example**
```python
import paddlehub as hub
import soundfile as sf
# Load transformer_tts_ljspeech module.
module = hub.Module(name="transformer_tts_ljspeech")
# Synthesize speech for the given texts.
test_texts = ["Life was like a box of chocolates, you never know what you're gonna get."]
wavs, sample_rate = module.synthesize(texts=test_texts, use_gpu=True, vocoder="waveflow")
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
```
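Note that `use_gpu=True` in the example above only takes effect when the CUDA\_VISIBLE\_DEVICES environment variable has been set beforehand; otherwise the module falls back to CPU prediction.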
## Serving Deployment
PaddleHub Serving can deploy the model as an online service.
### Step 1: Start PaddleHub Serving
Run the start command:
```shell
$ hub serving start -m transformer_tts_ljspeech
```
This deploys a serving API; the default port is 8866.
**NOTE:** To predict on the GPU, set the CUDA\_VISIBLE\_DEVICES environment variable before starting the service; otherwise it does not need to be set.
### Step 2: Send a Prediction Request
With the server configured, the following few lines of code send a prediction request and retrieve the result.
```python
import requests
import json
import soundfile as sf
# Send the HTTP request
data = {'texts':['Simple as this proposition is, it is necessary to be stated',
'Parakeet stands for Paddle PARAllel text-to-speech toolkit'],
'use_gpu':False}
headers = {"Content-type": "application/json"}
url = "http://127.0.0.1:8866/predict/transformer_tts_ljspeech"
r = requests.post(url=url, headers=headers, data=json.dumps(data))
# Save the results
result = r.json()["results"]
wavs = result["wavs"]
sample_rate = result["sample_rate"]
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
```
## Source Code
https://github.com/PaddlePaddle/Parakeet
### Dependencies
paddlepaddle >= 1.8.2
paddlehub >= 1.7.0
**NOTE:** In addition to the Python dependencies, the libsndfile library must be installed.
On Ubuntu, run:
```
sudo apt-get install libsndfile1
```
On CentOS, run:
```
sudo yum install libsndfile
```
## Release History
* 1.0.0
  Initial release
# coding:utf-8
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import ast
import argparse
import importlib.util
import nltk
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddlehub as hub
from paddlehub.module.module import runnable
from paddlehub.common.utils import mkdir
from paddlehub.module.nlp_module import DataFormatError
from paddlehub.common.logger import logger
from paddlehub.module.module import moduleinfo, serving
from paddlehub.common.dir import THIRD_PARTY_HOME
from paddlehub.common.downloader import default_downloader
lack_dependency = []
for dependency in ["ruamel", "parakeet", "scipy", "soundfile", "librosa"]:
if not importlib.util.find_spec(dependency):
lack_dependency.append(dependency)
# Accelerate NLTK package download via paddlehub. 'import parakeet' will use the package.
_PUNKT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/punkt.tar.gz"
_CMUDICT_URL = "https://paddlehub.bj.bcebos.com/paddlehub-thirdparty/cmudict.tar.gz"
nltk_path = os.path.join(THIRD_PARTY_HOME, "nltk_data")
tokenizers_path = os.path.join(nltk_path, "tokenizers")
corpora_path = os.path.join(nltk_path, "corpora")
punkt_path = os.path.join(tokenizers_path, "punkt")
cmudict_path = os.path.join(corpora_path, "cmudict")
if not os.path.exists(punkt_path):
default_downloader.download_file_and_uncompress(
url=_PUNKT_URL, save_path=tokenizers_path, print_progress=True)
if not os.path.exists(cmudict_path):
default_downloader.download_file_and_uncompress(
url=_CMUDICT_URL, save_path=corpora_path, print_progress=True)
nltk.data.path.append(nltk_path)
if not lack_dependency:
import soundfile as sf
import librosa
from ruamel import yaml
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts import TransformerTTS as TransformerTTSModel
from parakeet.models.waveflow import WaveFlowModule
from parakeet.utils import io
from parakeet.modules.weight_norm import WeightNormWrapper
else:
raise ImportError(
"The module requires additional dependencies: %s. You can install parakeet via 'git clone https://github.com/PaddlePaddle/Parakeet && cd Parakeet && pip install -e .' and others via pip install"
% ", ".join(lack_dependency))
class AttrDict(dict):
def __init__(self, *args, **kwargs):
super(AttrDict, self).__init__(*args, **kwargs)
self.__dict__ = self
@moduleinfo(
name="transformer_tts_ljspeech",
version="1.0.0",
summary=
"Transformer TTS introduces and adapts the multi-head attention mechanism to replace the RNN structures and also the original attention mechanism in Tacotron2. See https://arxiv.org/abs/1809.08895 for details",
author="baidu-nlp",
author_email="",
type="nlp/tts",
)
class TransformerTTS(hub.NLPPredictionModule):
def _initialize(self):
"""
initialize with the necessary elements
"""
self.tts_checkpoint_path = os.path.join(self.directory, "assets", "tts",
"step-120000")
self.waveflow_checkpoint_path = os.path.join(self.directory, "assets",
"vocoder", "step-2000000")
self.waveflow_config_path = os.path.join(
self.directory, "assets", "vocoder", "waveflow_ljspeech.yaml")
tts_config_path = os.path.join(self.directory, "assets", "tts",
"ljspeech.yaml")
with open(tts_config_path) as f:
self.tts_config = yaml.load(f, Loader=yaml.Loader)
# The maximum number of decoder steps when synthesizing audio.
self.max_len = 1000
# The stop-token threshold above which the model stops generating the spectrogram.
self.stop_threshold = 0.5
with fluid.dygraph.guard(fluid.CPUPlace()):
# Build TTS.
with fluid.unique_name.guard():
network_cfg = self.tts_config['network']
self.tts_model = TransformerTTSModel(
network_cfg['embedding_size'], network_cfg['hidden_size'],
network_cfg['encoder_num_head'],
network_cfg['encoder_n_layers'],
self.tts_config['audio']['num_mels'],
network_cfg['outputs_per_step'],
network_cfg['decoder_num_head'],
network_cfg['decoder_n_layers'])
io.load_parameters(
model=self.tts_model,
checkpoint_path=self.tts_checkpoint_path)
# Build vocoder.
args = AttrDict()
args.config = self.waveflow_config_path
args.use_fp16 = False
self.waveflow_config = io.add_yaml_config_to_args(args)
self.waveflow = WaveFlowModule(self.waveflow_config)
io.load_parameters(
model=self.waveflow,
checkpoint_path=self.waveflow_checkpoint_path)
def synthesize(self, texts, use_gpu=False, vocoder="griffin-lim"):
"""
Get the synthetic wavs from the texts.
Args:
texts(list): the input texts to be predicted.
use_gpu(bool): whether to use gpu for prediction.
vocoder(str): the vocoder name, "griffin-lim" or "waveflow".
Returns:
wavs(list): the synthesized audio waveforms. You can use soundfile.write to save them.
sample_rate(int): the audio sample rate.
"""
if use_gpu and "CUDA_VISIBLE_DEVICES" not in os.environ:
use_gpu = False
logger.warning(
"use_gpu has been set False as you didn't set the environment variable CUDA_VISIBLE_DEVICES while using use_gpu=True"
)
if use_gpu:
place = fluid.CUDAPlace(0)
else:
place = fluid.CPUPlace()
if texts and isinstance(texts, list):
predicted_data = texts
else:
raise ValueError(
"The input data is inconsistent with expectations.")
wavs = []
with fluid.dygraph.guard(place):
self.tts_model.eval()
self.waveflow.eval()
for text in predicted_data:
# init input
logger.info("Processing sentence: %s" % text)
text = np.asarray(text_to_sequence(text))
text = fluid.layers.unsqueeze(
dg.to_variable(text).astype(np.int64), [0])
mel_input = dg.to_variable(np.zeros([1, 1,
80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(
dg.to_variable(pos_text).astype(np.int64), [0])
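# Autoregressive decoding: append the last predicted frame to the decoder input until the stop token exceeds stop_threshold or max_len steps are reached.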
for i in range(self.max_len):
pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(
dg.to_variable(pos_mel).astype(np.int64), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = self.tts_model(
text, mel_input, pos_text, pos_mel)
if stop_preds.numpy()[0, -1] > self.stop_threshold:
break
mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1)
if vocoder == 'griffin-lim':
# synthesis use griffin-lim
wav = self.synthesis_with_griffinlim(
postnet_pred, self.tts_config['audio'])
elif vocoder == 'waveflow':
# synthesis use waveflow
wav = self.synthesis_with_waveflow(
postnet_pred, self.waveflow_config.sigma)
else:
raise ValueError(
'vocoder error: only griffin-lim and waveflow are supported, but received %s.'
% vocoder)
wavs.append(wav)
return wavs, self.tts_config['audio']['sr']
def synthesis_with_griffinlim(self, mel_output, cfg):
# synthesis with griffin-lim
mel_output = fluid.layers.transpose(
fluid.layers.squeeze(mel_output, [0]), [1, 0])
mel_output = np.exp(mel_output.numpy())
basis = librosa.filters.mel(
cfg['sr'],
cfg['n_fft'],
cfg['num_mels'],
fmin=cfg['fmin'],
fmax=cfg['fmax'])
inv_basis = np.linalg.pinv(basis)
spec = np.maximum(1e-10, np.dot(inv_basis, mel_output))
wav = librosa.core.griffinlim(
spec**cfg['power'],
hop_length=cfg['hop_length'],
win_length=cfg['win_length'])
return wav
def synthesis_with_waveflow(self, mel_output, sigma):
mel_spectrogram = fluid.layers.transpose(
fluid.layers.squeeze(mel_output, [0]), [1, 0])
mel_spectrogram = fluid.layers.unsqueeze(mel_spectrogram, [0])
for layer in self.waveflow.sublayers():
if isinstance(layer, WeightNormWrapper):
layer.remove_weight_norm()
# Run model inference.
wav = self.waveflow.synthesize(mel_spectrogram, sigma=sigma)
return wav.numpy()[0]
@serving
def serving_method(self, texts, use_gpu=False, vocoder="griffin-lim"):
"""
Run as a service.
"""
wavs, sample_rate = self.synthesize(texts, use_gpu, vocoder)
wavs = [wav.tolist() for wav in wavs]
result = {"wavs": wavs, "sample_rate": sample_rate}
return result
def add_module_config_arg(self):
"""
Add the command config options
"""
self.arg_config_group.add_argument(
'--use_gpu',
type=ast.literal_eval,
default=False,
help="whether use GPU for prediction")
self.arg_config_group.add_argument(
'--vocoder',
type=str,
default="griffin-lim",
choices=['griffin-lim', 'waveflow'],
help="the vocoder name")
def add_module_output_arg(self):
"""
Add the command config options
"""
self.arg_config_group.add_argument(
'--output_path',
type=str,
default=os.path.abspath(
os.path.join(os.path.curdir, f"{self.name}_prediction")),
help="path to save experiment results")
@runnable
def run_cmd(self, argvs):
"""
Run as a command
"""
self.parser = argparse.ArgumentParser(
description='Run the %s module.' % self.name,
prog='hub run %s' % self.name,
usage='%(prog)s',
add_help=True)
self.arg_input_group = self.parser.add_argument_group(
title="Input options", description="Input data. Required")
self.arg_input_group = self.parser.add_argument_group(
title="Ouput options", description="Ouput path. Optional.")
self.arg_config_group = self.parser.add_argument_group(
title="Config options",
description=
"Run configuration for controlling module behavior, optional.")
self.add_module_config_arg()
self.add_module_input_arg()
self.add_module_output_arg()
args = self.parser.parse_args(argvs)
try:
input_data = self.check_input_data(args)
except (DataFormatError, RuntimeError):
self.parser.print_help()
return None
mkdir(args.output_path)
wavs, sample_rate = self.synthesize(
texts=input_data, use_gpu=args.use_gpu, vocoder=args.vocoder)
for index, wav in enumerate(wavs):
sf.write(
os.path.join(args.output_path, f"{index}.wav"), wav,
sample_rate)
ret = f"The synthesized wav files have been saved in {args.output_path}"
return ret
if __name__ == "__main__":
module = TransformerTTS()
test_text = [
"Life was like a box of chocolates, you never know what you're gonna get.",
]
wavs, sample_rate = module.synthesize(texts=test_text, vocoder="waveflow")
for index, wav in enumerate(wavs):
sf.write(f"{index}.wav", wav, sample_rate)
name: deepvoice3_ljspeech
dir: "modules/audio/tts/deep_voice3"
exclude:
- README.md
resources:
-
url: https://paddlespeech.bj.bcebos.com/Parakeet/deepvoice3_ljspeech_griffin-lim_ckpt_1.0.zip
dest: assets
uncompress: True
@@ -19,7 +19,6 @@ from __future__ import print_function
import argparse
import ast
import json
import os
import re
import six
@@ -31,7 +30,6 @@ import paddle.fluid as fluid
import paddlehub as hub
from paddle.fluid.core import PaddleTensor, AnalysisConfig, create_paddle_predictor
from paddlehub.common import paddle_helper, tmp_dir
from paddlehub.common.logger import logger
from paddlehub.common.utils import sys_stdin_encoding, version_compare
from paddlehub.io.parser import txt_parser
from paddlehub.module.module import runnable
@@ -9,6 +9,7 @@ cma >= 2.7.0
sentencepiece
colorlog
tqdm
nltk
# pandas no longer support python2 in version 0.25 and above
pandas ; python_version >= "3"