# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
"""
At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__.
At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__.
...
@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
...
@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
So the sampler is only responsible for generating valid indices.
So the sampler is only responsible for generating valid indices.
"""
"""
# Standard-library imports first, then third-party (PEP 8 grouping).
import random

import numpy as np
class Sampler(object):
    """Base class for all samplers.

    A sampler is an iterable of valid dataset indices; subclasses are
    expected to implement ``__iter__`` (and usually ``__len__``).

    Args:
        data_source: The dataset to sample indices for. The base class
            stores nothing; the parameter exists so subclasses share a
            uniform constructor signature.
    """

    def __init__(self, data_source):
        # Intentionally a no-op: concrete samplers decide what (if
        # anything) to keep from data_source.
        pass
...
@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
...
@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement))
"replacement={}".format(self.replacement))
ifself._num_samplesisnotNoneandnotreplacement:
ifself._num_samplesisnotNoneandnotreplacement:
raiseValueError("With replacement=False, num_samples should not be specified, "
raiseValueError(
"With replacement=False, num_samples should not be specified, "
One of the reasons we choose to load data lazily (only load metadata before hand) is memory efficiency.
For deep learning practice, we typically batch examples. So the dataset should come with a method to batch examples. Assuming the record is implemented as a tuple with several items: when an item is represented as a fixed-size array, batching them is trivial — `np.stack` suffices. But for arrays with dynamic size, padding is needed. We decide to implement a batching method for each item. Then batching a record can be implemented by these methods. For a dataset, a `_batch_examples` method should be implemented. But in most cases, you can choose one from `batching.py`.