Commit faa725ba, authored by liuyibing01

Merge branch 'add_license' into 'master'

add license

See merge request !24
@@ -25,3 +25,11 @@
        files: \.md$
    -   id: remove-tabs
        files: \.md$
-   repo: local
    hooks:
    -   id: copyright_checker
        name: copyright_checker
        entry: python ./tools/copyright.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
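With this hook registered, the license check runs on every commit; it can also be invoked by hand. A usage sketch, assuming a standard `pre-commit` installation:

```bash
pre-commit run copyright_checker --all-files
```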
# Deepvoice 3

Paddle implementation of deepvoice 3 in dynamic graph, a convolutional network based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
@@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
## Project Structure

```text
├── data.py          data processing
├── ljspeech.yaml    (example) configuration file
├── sentences.txt    sample sentences
├── synthesis.py     script to synthesize waveform from text
@@ -50,7 +50,7 @@ optional arguments:
                        The directory to save result.
  -g DEVICE, --device DEVICE
                        device to use
```
1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly, or you can change some values in it and train the model with a different config.
2. `--data` is the path of the LJSpeech dataset, i.e. the folder extracted from the downloaded archive (the one that contains `metadata.csv`). An example training invocation is sketched below.
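For reference, a typical training command might look like the following (a sketch only: the positional output directory `experiment` mirrors the tree below, and the flags are the ones documented above):

```bash
python train.py --config=./ljspeech.yaml --data=./LJSpeech-1.1 --device=0 experiment
```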
@@ -61,7 +61,7 @@ optional arguments:
├── checkpoints      # checkpoint
├── log              # tensorboard log
└── states           # train and evaluation results
    ├── alignments   # attention
    ├── lin_spec     # linear spectrogram
    ├── mel_spec     # mel spectrogram
    └── waveform     # waveform (.wav files)
@@ -112,4 +112,3 @@ example script:
```bash
python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import csv
from pathlib import Path
@@ -79,10 +93,11 @@ class Transform(object):
        y = signal.lfilter([1., -self.preemphasis], [1.], wav)

        # STFT
        D = librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)
        S = np.abs(D)

        # to db and normalize to 0-1
@@ -96,11 +111,8 @@ class Transform(object):
        # mel scale and to db and normalize to 0-1,
        # CAUTION: pass linear scale S, not dbscaled S
        S_mel = librosa.feature.melspectrogram(
            S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
        S_mel = 20 * np.log10(np.maximum(amplitude_min,
                                         S_mel)) - self.ref_level_db
        S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
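The dB conversion and 0-1 normalization above follow the usual recipe; a self-contained sketch of the same arithmetic (assuming typical values `ref_level_db = 20` and `min_level_db = -100`, which come from the config rather than this hunk):

```python
import numpy as np

amplitude_min = 1e-5                      # floor to avoid log(0)
ref_level_db, min_level_db = 20., -100.   # assumed typical config values

S_mel = np.abs(np.random.randn(80, 100))  # stand-in mel magnitudes
S_mel_db = 20 * np.log10(np.maximum(amplitude_min, S_mel)) - ref_level_db
# map [min_level_db, 0] dB onto [0, 1]; the clip is for safety in this sketch
S_mel_norm = np.clip((S_mel_db - min_level_db) / (-min_level_db), 0, 1)
```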
@@ -148,20 +160,18 @@ class DataCollector(object):
            (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
             S_mel_norm, num_frames) = example
            text_sequences.append(
                np.pad(mix_grapheme_phonemes,
                       (0, max_text_length - text_length)))
            lin_specs.append(
                np.pad(S_norm,
                       ((0, 0), (self._pad_begin,
                                 max_frames - self._pad_begin - num_frames))))
            mel_specs.append(
                np.pad(S_mel_norm,
                       ((0, 0), (self._pad_begin,
                                 max_frames - self._pad_begin - num_frames))))
            done_flags.append(
                np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
                       (0, max_decoder_length -
                        int(np.ceil(num_frames // self._factor))),
                       constant_values=1))
        text_sequences = np.array(text_sequences).astype(np.int64)
        lin_specs = np.transpose(np.array(lin_specs),
...
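To make the `DataCollector` padding concrete, here is a toy run of the `done_flags` logic above (hypothetical numbers; `factor` stands in for `self._factor`, the decoder downsampling factor):

```python
import numpy as np

num_frames, max_decoder_length, factor = 50, 20, 4
n_steps = int(np.ceil(num_frames // factor))  # 12 decoder steps of real audio
done = np.pad(np.zeros((n_steps, )), (0, max_decoder_length - n_steps),
              constant_values=1)
print(done.astype(int))  # 12 zeros, then 8 ones marking padded steps
```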
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
@@ -22,11 +36,8 @@ if __name__ == "__main__":
parser.add_argument("checkpoint", type=str, help="checkpoint to load.") parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
parser.add_argument("text", type=str, help="text file to synthesize") parser.add_argument("text", type=str, help="text file to synthesize")
parser.add_argument("output_path", type=str, help="path to save results") parser.add_argument("output_path", type=str, help="path to save results")
parser.add_argument("-g", parser.add_argument(
"--device", "-g", "--device", type=int, default=-1, help="device to use")
type=int,
default=-1,
help="device to use")
    args = parser.parse_args()

    with open(args.config, 'rt') as f:
@@ -76,15 +87,14 @@ if __name__ == "__main__":
    window_ahead = model_config["window_ahead"]
    key_projection = model_config["key_projection"]
    value_projection = model_config["value_projection"]
    dv3 = make_model(
        n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
        embedding_std, max_positions, n_vocab, freeze_embedding,
        filter_size, encoder_channels, n_mels, decoder_channels, r,
        trainable_positional_encodings, use_memory_mask,
        query_position_rate, key_position_rate, window_backward,
        window_ahead, key_projection, value_projection, downsample_factor,
        linear_dim, use_decoder_states, converter_channels, dropout)
    summary(dv3)
    state, _ = dg.load_dygraph(args.checkpoint)
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import argparse
import ruamel.yaml
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
from matplotlib import cm
@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
        spe = dg.Embedding(
            (n_speakers, speaker_dim),
            param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None
@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3), )
    enc = Encoder(
        n_vocab,
        embed_dim,
        n_speakers,
        speaker_dim,
        padding_idx=None,
        embedding_weight_std=embedding_std,
        convolutions=encoder_convolutions,
        max_positions=max_positions,
        dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)
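The `ConvSpec(channels, kernel_size, dilation)` stacks above grow the dilation geometrically (1, 3, 9, 27, ...), which is what gives the encoder a wide receptive field with few layers. A back-of-the-envelope helper (the dilation list here is illustrative, not the model's exact configuration):

```python
def receptive_field(kernel_size, dilations):
    """Receptive field of a stack of dilated convolutions."""
    return 1 + sum((kernel_size - 1) * d for d in dilations)

# e.g. kernel size 5 with two rounds of dilations 1, 3, 9, 27
print(receptive_field(5, [1, 3, 9, 27, 1, 3, 9, 27]))  # 321 positions
```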
@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1), )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
    dec = Decoder(
        n_speakers,
        speaker_dim,
        embed_dim,
        mel_dim,
        r=r,
        max_positions=max_positions,
        padding_idx=padding_idx,
        preattention=prenet_convolutions,
        convolutions=attentive_convolutions,
        attention=attention,
        dropout=dropout,
        use_memory_mask=use_memory_mask,
        force_monotonic_attention=force_monotonic_attention,
        query_position_rate=query_position_rate,
        key_position_rate=key_position_rate,
        window_range=WindowRange(window_behind, window_ahead),
        key_projection=key_projection,
        value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)
@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
        ConvSpec(2 * h, k, 3), )
    cvt = Converter(
        n_speakers,
        speaker_dim,
        dec.state_dim if use_decoder_states else mel_dim,
        linear_dim,
        time_upsampling=downsample_factor,
        convolutions=postnet_convolutions,
        dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
               ref_level_db, power, n_iter, win_length, hop_length,
               preemphasis):
    """generate waveform from text using a deepvoice 3 model"""
    text = np.array(
        en.text_to_sequence(
            text, p=replace_pronounciation_prob),
        dtype=np.int64)
    length = len(text)
    print("text sequence's length: {}".format(length))
    text_positions = np.arange(1, 1 + length)
@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
    """
    denormalized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
    lin_scaled = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
    wav = librosa.griffinlim(
        lin_scaled**power,
        n_iter=n_iter,
        hop_length=hop_length,
        win_length=win_length)
    if preemphasis > 0:
        wav = signal.lfilter([1.], [1., -preemphasis], wav)
    return wav
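The first two lines of `spec_to_waveform` undo the dB normalization applied during preprocessing, and `np.exp(x / 20 * np.log(10))` is simply `10 ** (x / 20)`. A minimal round-trip check (assuming the typical config values `min_level_db = -100` and `ref_level_db = 20`):

```python
import numpy as np

min_level_db, ref_level_db = -100., 20.
S = np.array([0.1, 1.0, 10.0])                    # linear magnitudes
S_db = 20 * np.log10(S) - ref_level_db            # forward: to dB
S_norm = (S_db - min_level_db) / (-min_level_db)  # forward: to [0, 1]

denormalized = np.clip(S_norm, 0, 1) * (-min_level_db) + min_level_db
S_back = np.exp((denormalized + ref_level_db) / 20 * np.log(10))
assert np.allclose(S, S_back)
```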
@@ -225,28 +243,30 @@ def save_state(save_dir,
        plt.colorbar()
        plt.title("mel_input")
        plt.savefig(
            os.path.join(path, "target_mel_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "target/mel_spec",
            cm.viridis(mel_input),
            global_step,
            dataformats="HWC")

        plt.figure(figsize=(10, 3))
        display.specshow(mel_output)
        plt.colorbar()
        plt.title("mel_output")
        plt.savefig(
            os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "predicted/mel_spec",
            cm.viridis(mel_output),
            global_step,
            dataformats="HWC")
    if lin_input is not None and lin_output is not None:
        lin_input = lin_input[0].numpy().T
@@ -258,28 +278,30 @@ def save_state(save_dir,
        plt.colorbar()
        plt.title("lin_input")
        plt.savefig(
            os.path.join(path, "target_lin_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "target/lin_spec",
            cm.viridis(lin_input),
            global_step,
            dataformats="HWC")
        plt.figure(figsize=(10, 3))
        display.specshow(lin_output)
        plt.colorbar()
        plt.title("lin_output")
        plt.savefig(
            os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
                global_step)))
        plt.close()
        writer.add_image(
            "predicted/lin_spec",
            cm.viridis(lin_output),
            global_step,
            dataformats="HWC")
    if alignments is not None and len(alignments.shape) == 4:
        path = os.path.join(save_dir, "alignments")
@@ -290,10 +312,11 @@ def save_state(save_dir,
                "train_attn_layer_{}_step_{}.png".format(idx, global_step))
            plot_alignment(attn_layer, save_path)
writer.add_image("train_attn/layer_{}".format(idx), writer.add_image(
cm.viridis(attn_layer), "train_attn/layer_{}".format(idx),
global_step, cm.viridis(attn_layer),
dataformats="HWC") global_step,
dataformats="HWC")
if lin_output is not None: if lin_output is not None:
wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power, wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
@@ -302,7 +325,5 @@ def save_state(save_dir,
        save_path = os.path.join(
            path, "train_sample_step_{:09d}.wav".format(global_step))
        sf.write(save_path, wav, sample_rate)
        writer.add_audio(
            "train_sample", wav, global_step, sample_rate=sample_rate)
@@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``.
For more help on arguments:
``python train.py --help``.

## Synthesis
@@ -75,5 +75,5 @@ or you can run the script file directly.
sh synthesis.sh
```
For more help on arguments:
``python synthesis.py --help``.
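Combining the flags from the argument parser (`parse.py`, shown below in this commit), a concrete synthesis call might look like this sketch (the checkpoint step number is illustrative):

```bash
python synthesis.py --use_gpu=1 --checkpoint_path=./checkpoint --fastspeech_step=70000 --sample_path=./sample
```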
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def add_config_options_to_parser(parser):
    parser.add_argument(
        '--config_path',
        type=str,
        default='config/fastspeech.yaml',
        help="the yaml config file path.")
    parser.add_argument(
        '--batch_size', type=int, default=32, help="batch size for training.")
    parser.add_argument(
        '--epochs',
        type=int,
        default=10000,
        help="the number of epochs for training.")
    parser.add_argument(
        '--lr',
        type=float,
        default=0.001,
        help="the learning rate for training.")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500,
        help="checkpointing interval during training.")
    parser.add_argument(
        '--fastspeech_step',
        type=int,
        default=70000,
        help="Global step to restore checkpoint of fastspeech.")
    parser.add_argument(
        '--use_gpu',
        type=int,
        default=1,
        help="use gpu or not during training.")
    parser.add_argument(
        '--use_data_parallel',
        type=int,
        default=0,
        help="use data parallel or not during training.")
    parser.add_argument(
        '--data_path',
        type=str,
        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default=None,
        help="the path to load checkpoint or pretrained model.")
    parser.add_argument(
        '--save_path',
        type=str,
        default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument(
        '--log_dir',
        type=str,
        default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument(
        '--sample_path',
        type=str,
        default='./sample',
        help="the directory to save audio sample in synthesis.")
    parser.add_argument(
        '--transtts_path',
        type=str,
        default='./log',
        help="the directory to load pretrained transformerTTS model.")
    parser.add_argument(
        '--transformer_step',
        type=int,
        default=160000,
        help="the step to load transformerTTS model.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tensorboardX import SummaryWriter
from collections import OrderedDict
@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
from parakeet import audio
from parakeet.models.fastspeech.fastspeech import FastSpeech


def load_checkpoint(step, model_path):
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict


def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())

    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)
@@ -37,24 +52,28 @@ def synthesis(text_input, args):
    with dg.guard(place):
        model = FastSpeech(cfg)
        model.set_dict(
            load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech")))
        model.eval()
        text = np.asarray(text_to_sequence(text_input))
        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
        pos_text = np.arange(1, text.shape[1] + 1)
        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])

        mel_output, mel_output_postnet = model(
            text, pos_text, alpha=args.alpha)
        _ljspeech_processor = audio.AudioProcessor(
            sample_rate=cfg['audio']['sr'],
            num_mels=cfg['audio']['num_mels'],
            min_level_db=cfg['audio']['min_level_db'],
            ref_level_db=cfg['audio']['ref_level_db'],
            n_fft=cfg['audio']['n_fft'],
            win_length=cfg['audio']['win_length'],
            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
@@ -67,14 +86,17 @@ def synthesis(text_input, args):
            do_trim_silence=False,
            sound_norm=False)

        mel_output_postnet = fluid.layers.transpose(
            fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
        wav = _ljspeech_processor.inv_melspectrogram(
            mel_output_postnet.numpy())
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        print("Synthesis completed !!!")
        writer.close()
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Synthesize with fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    synthesis("Transformer model is so fast!", args)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import argparse
import os
@@ -20,8 +33,10 @@ import sys
sys.path.append("../transformer_tts")
from data import LJSpeechLoader
def load_checkpoint(step, model_path):
    model_dict, opti_dict = fluid.dygraph.load_dygraph(
        os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict
def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
@@ -43,26 +59,33 @@ def main(args):
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'fastspeech')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        with fluid.unique_name.guard():
            transformerTTS = TransformerTTS(cfg)
            model_dict, _ = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.transtts_path, "transformer"))
            transformerTTS.set_dict(model_dict)
            transformerTTS.eval()

        model = FastSpeech(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())
        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.fastspeech_step),
                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
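The expression `1 / (cfg['warm_up_step'] * (args.lr**2))` passed to `dg.NoamDecay` is its `d_model` argument, chosen so that the warm-up schedule peaks at exactly `args.lr`. A quick sketch of the arithmetic (assuming NoamDecay computes the standard Noam formula `d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)`):

```python
def noam_lr(step, lr=0.001, warmup=4000):
    d_model = 1 / (warmup * lr**2)  # mirrors the value passed above
    return d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)

print(noam_lr(4000))  # ~0.001: the peak equals lr at step == warmup
```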
@@ -76,31 +99,42 @@ def main(args):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
                _, _, attn_probs, _, _, _ = transformerTTS(
                    character, mel_input, pos_text, pos_mel)
                alignment = dg.to_variable(
                    get_alignment(attn_probs, mel_lens,
                                  cfg['transformer_head'])).astype(np.float32)

                global_step += 1

                # Forward
                result = model(
                    character,
                    pos_text,
                    mel_pos=pos_mel,
                    length_target=alignment)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result

                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
                duration_loss = layers.mean(
                    layers.abs(
                        layers.elementwise_sub(duration_predictor_output,
                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss

                if local_rank == 0:
                    writer.add_scalar('mel_loss',
                                      mel_loss.numpy(), global_step)
                    writer.add_scalar('post_mel_loss',
                                      mel_postnet_loss.numpy(), global_step)
                    writer.add_scalar('duration_loss',
                                      duration_loss.numpy(), global_step)
                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
@@ -108,21 +142,25 @@ def main(args):
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
                optimizer.minimize(
                    total_loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                        cfg['grad_clip_thresh']))
                model.clear_gradients()
                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
        if local_rank == 0:
            writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train Fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
...
@@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--transformer_step``.
For more help on arguments:
``python train_transformer.py --help``.

## Train Vocoder
@@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
```
If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--vocoder_step``.
For more help on arguments:
``python train_vocoder.py --help``.

## Synthesis
@@ -101,5 +101,5 @@ sh synthesis.sh
And the audio file will be saved in ``--sample_path``.
For more help on arguments:
``python synthesis.py --help``.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
@@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, SpecBatcher
from parakeet.data.dataset import DatasetMixin, TransformDataset
class LJSpeechLoader:
    def __init__(self,
                 config,
                 args,
                 nranks,
                 rank,
                 is_vocoder=False,
                 shuffle=True):
        place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()

        LJSPEECH_ROOT = Path(args.data_path)
        metadata = LJSpeechMetaData(LJSPEECH_ROOT)
        transformer = LJSpeech(config)
        dataset = TransformDataset(metadata, transformer)
        sampler = DistributedSampler(
            len(metadata), nranks, rank, shuffle=shuffle)

        assert args.batch_size % nranks == 0
        each_bs = args.batch_size // nranks
        if is_vocoder:
            dataloader = DataCargo(
                dataset,
                sampler=sampler,
                batch_size=each_bs,
                shuffle=shuffle,
                batch_fn=batch_examples_vocoder,
                drop_last=True)
        else:
            dataloader = DataCargo(
                dataset,
                sampler=sampler,
                batch_size=each_bs,
                shuffle=shuffle,
                batch_fn=batch_examples,
                drop_last=True)

        self.reader = fluid.io.DataLoader.from_generator(
            capacity=32,
            iterable=True,
@@ -63,13 +96,13 @@ class LJSpeech(object):
        super(LJSpeech, self).__init__()
        self.config = config
        self._ljspeech_processor = audio.AudioProcessor(
            sample_rate=config['audio']['sr'],
            num_mels=config['audio']['num_mels'],
            min_level_db=config['audio']['min_level_db'],
            ref_level_db=config['audio']['ref_level_db'],
            n_fft=config['audio']['n_fft'],
            win_length=config['audio']['win_length'],
            hop_length=config['audio']['hop_length'],
            power=config['audio']['power'],
            preemphasis=config['audio']['preemphasis'],
            signal_norm=True,
@@ -81,7 +114,7 @@ class LJSpeech(object):
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)

    def __call__(self, metadatum):
        """All the code for generating an Example from a metadatum. If you want a
        different preprocessing pipeline, you can override this method.
@@ -90,13 +123,15 @@ class LJSpeech(object):
        method.
        """
        fname, raw_text, normalized_text = metadatum

        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        wav = self._ljspeech_processor.load_wav(str(fname))
        mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
        mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
        phonemes = np.array(
            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
        # maybe we need to implement it as a map in the future
        return (mag, mel, phonemes)
def batch_examples(batch):
@@ -109,44 +144,71 @@ def batch_examples(batch):
    pos_mels = []
    for data in batch:
        _, mel, text = data
        mel_inputs.append(
            np.concatenate(
                [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
                axis=-1))
        mel_lens.append(mel.shape[1])
        text_lens.append(len(text))
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)
    # Sort by text_len in descending order
    texts = [
        i
        for i, _ in sorted(
            zip(texts, text_lens), key=lambda x: x[1], reverse=True)
    ]
    mels = [
        i
        for i, _ in sorted(
            zip(mels, text_lens), key=lambda x: x[1], reverse=True)
    ]
    mel_inputs = [
        i
        for i, _ in sorted(
            zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
    ]
    mel_lens = [
        i
        for i, _ in sorted(
            zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
    ]
    pos_texts = [
        i
        for i, _ in sorted(
            zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
    ]
    pos_mels = [
        i
        for i, _ in sorted(
            zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
    ]
    text_lens = sorted(text_lens, reverse=True)
    # Pad sequence with largest len of the batch
    texts = TextIDBatcher(pad_id=0)(texts)  #(B, T)
    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)  #(B, T)
    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)  #(B, T)
    mels = np.transpose(
        SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  #(B, T, num_mels)
    mel_inputs = np.transpose(
        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))  #(B, T, num_mels)
    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
            np.array(mel_lens))
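The repeated sort-by-`text_lens` pattern above keeps every per-example list aligned while reordering the whole batch by descending text length; a toy illustration:

```python
texts = ['ab', 'defgh', 'c']
text_lens = [2, 5, 1]
texts = [
    i for i, _ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)
]
print(texts)  # ['defgh', 'ab', 'c']
```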
def batch_examples_vocoder(batch):
    mels = []
    mags = []
    for data in batch:
        mag, mel, _ = data
        mels.append(mel)
        mags.append(mag)
    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
    return (mels, mags)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse


def add_config_options_to_parser(parser):
    parser.add_argument(
        '--config_path',
        type=str,
        default='config/train_transformer.yaml',
        help="the yaml config file path.")
    parser.add_argument(
        '--batch_size', type=int, default=32, help="batch size for training.")
    parser.add_argument(
        '--epochs',
        type=int,
        default=10000,
        help="the number of epochs for training.")
    parser.add_argument(
        '--lr',
        type=float,
        default=0.001,
        help="the learning rate for training.")
    parser.add_argument(
        '--save_step',
        type=int,
        default=500,
        help="checkpointing interval during training.")
    parser.add_argument(
        '--image_step',
        type=int,
        default=2000,
        help="attention image interval during training.")
    parser.add_argument(
        '--max_len',
        type=int,
        default=400,
        help="The max length of audio when synthesis.")
    parser.add_argument(
        '--transformer_step',
        type=int,
        default=160000,
        help="Global step to restore checkpoint of transformer.")
    parser.add_argument(
        '--vocoder_step',
        type=int,
        default=90000,
        help="Global step to restore checkpoint of postnet.")
    parser.add_argument(
        '--use_gpu',
        type=int,
        default=1,
        help="use gpu or not during training.")
    parser.add_argument(
        '--use_data_parallel',
        type=int,
        default=0,
        help="use data parallel or not during training.")
    parser.add_argument(
        '--stop_token',
        type=int,
        default=0,
        help="use stop token loss in network or not.")
    parser.add_argument(
        '--data_path',
        type=str,
        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
    parser.add_argument(
        '--checkpoint_path',
        type=str,
        default=None,
        help="the path to load checkpoint or pretrained model.")
    parser.add_argument(
        '--save_path',
        type=str,
        default='./checkpoint',
        help="the path to save checkpoint.")
    parser.add_argument(
        '--log_dir',
        type=str,
        default='./log',
        help="the directory to save tensorboard log.")
    parser.add_argument(
        '--sample_path',
        type=str,
        default='./sample',
        help="the directory to save audio sample in synthesis.")
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from scipy.io.wavfile import write
from parakeet.g2p.en import text_to_sequence
@@ -16,6 +29,7 @@ from parakeet import audio
from parakeet.models.transformer_tts.vocoder import Vocoder
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS


def load_checkpoint(step, model_path):
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict


def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
@@ -34,46 +49,53 @@ def synthesis(text_input, args):
    # tensorboard
    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'synthesis')

    writer = SummaryWriter(path)

    with dg.guard(place):
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
            model.set_dict(
                load_checkpoint(
                    str(args.transformer_step),
                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()
        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
            model_vocoder.set_dict(
                load_checkpoint(
                    str(args.vocoder_step),
                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()
# init input # init input
text = np.asarray(text_to_sequence(text_input)) text = np.asarray(text_to_sequence(text_input))
text = fluid.layers.unsqueeze(dg.to_variable(text),[0]) text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32) mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
pos_text = np.arange(1, text.shape[1]+1) pos_text = np.arange(1, text.shape[1] + 1)
pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0]) pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
pbar = tqdm(range(args.max_len)) pbar = tqdm(range(args.max_len))
for i in pbar: for i in pbar:
pos_mel = np.arange(1, mel_input.shape[1]+1) pos_mel = np.arange(1, mel_input.shape[1] + 1)
pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0]) pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel) mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1) text, mel_input, pos_text, pos_mel)
mel_input = fluid.layers.concat(
[mel_input, postnet_pred[:, -1:, :]], axis=1)
mag_pred = model_vocoder(postnet_pred) mag_pred = model_vocoder(postnet_pred)
_ljspeech_processor = audio.AudioProcessor( _ljspeech_processor = audio.AudioProcessor(
sample_rate=cfg['audio']['sr'], sample_rate=cfg['audio']['sr'],
num_mels=cfg['audio']['num_mels'], num_mels=cfg['audio']['num_mels'],
min_level_db=cfg['audio']['min_level_db'], min_level_db=cfg['audio']['min_level_db'],
ref_level_db=cfg['audio']['ref_level_db'], ref_level_db=cfg['audio']['ref_level_db'],
n_fft=cfg['audio']['n_fft'], n_fft=cfg['audio']['n_fft'],
win_length= cfg['audio']['win_length'], win_length=cfg['audio']['win_length'],
hop_length= cfg['audio']['hop_length'], hop_length=cfg['audio']['hop_length'],
power=cfg['audio']['power'], power=cfg['audio']['power'],
preemphasis=cfg['audio']['preemphasis'], preemphasis=cfg['audio']['preemphasis'],
signal_norm=True, signal_norm=True,
...@@ -86,13 +108,18 @@ def synthesis(text_input, args): ...@@ -86,13 +108,18 @@ def synthesis(text_input, args):
do_trim_silence=False, do_trim_silence=False,
sound_norm=False) sound_norm=False)
wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy()) wav = _ljspeech_processor.inv_spectrogram(
fluid.layers.transpose(
fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
writer.add_audio(text_input, wav, 0, cfg['audio']['sr']) writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
if not os.path.exists(args.sample_path): if not os.path.exists(args.sample_path):
os.mkdir(args.sample_path) os.mkdir(args.sample_path)
write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav) write(
os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
wav)
writer.close() writer.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="Synthesis model") parser = argparse.ArgumentParser(description="Synthesis model")
add_config_options_to_parser(parser) add_config_options_to_parser(parser)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from tqdm import tqdm
from tensorboardX import SummaryWriter
...@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
from data import LJSpeechLoader
from parakeet.models.transformer_tts.transformer_tts import TransformerTTS


def load_checkpoint(step, model_path):
    model_dict, opti_dict = fluid.dygraph.load_dygraph(
        os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
...@@ -40,22 +55,27 @@ def main(args):
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'transformer')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = TransformerTTS(cfg)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, shuffle=True).reader()

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.transformer_step),
                os.path.join(args.checkpoint_path, "transformer"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.transformer_step
...@@ -64,86 +84,112 @@ def main(args):
        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, _ = data

                global_step += 1

                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
                    character, mel_input, pos_text, pos_mel)

                label = (pos_mel == 0).astype(np.float32)

                mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
                post_mel_loss = layers.mean(
                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss
                # Note: training did not converge when the stop token loss was used.
                if args.stop_token:
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'mel_loss': mel_loss.numpy(),
                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)

                    if args.stop_token:
                        writer.add_scalar('stop_loss',
                                          stop_loss.numpy(), global_step)

                    if args.use_data_parallel:
                        writer.add_scalars('alphas', {
                            'encoder_alpha':
                            model._layers.encoder.alpha.numpy(),
                            'decoder_alpha':
                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                    else:
                        writer.add_scalars('alphas', {
                            'encoder_alpha': model.encoder.alpha.numpy(),
                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)

                    writer.add_scalar('learning_rate',
                                      optimizer._learning_rate.step().numpy(),
                                      global_step)

                    if global_step % args.image_step == 1:
                        for i, prob in enumerate(attn_probs):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                        for i, prob in enumerate(attn_enc):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_enc_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                        for i, prob in enumerate(attn_dec):
                            for j in range(4):
                                x = np.uint8(
                                    cm.viridis(prob.numpy()[j * 16]) * 255)
                                writer.add_image(
                                    'Attention_dec_%d_0' % global_step,
                                    x,
                                    i * 4 + j,
                                    dataformats="HWC")

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                # save checkpoint
                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    save_path = os.path.join(args.save_path,
                                             'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
    add_config_options_to_parser(parser)
...
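Both training scripts build their optimizer with `dg.NoamDecay(1 / (cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step'])`. Assuming Paddle's `NoamDecay(d_model, warmup_steps)` follows the usual Noam schedule, `d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)`, this particular choice of `d_model` makes the learning rate ramp up linearly and peak at exactly `args.lr` when `step == warm_up_step`. A minimal sketch of that arithmetic (the default values below are made up for illustration):

```python
def noam_lr(step, lr=0.001, warmup=4000):
    """Noam schedule with d_model chosen as 1 / (warmup * lr**2)."""
    d_model = 1 / (warmup * lr**2)
    return d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)


for step in [1, 2000, 4000, 16000]:
    # rises linearly to lr at step 4000, then decays as step**-0.5
    print(step, noam_lr(step))
```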
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from tensorboardX import SummaryWriter
import os
from tqdm import tqdm
...@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
from data import LJSpeechLoader
from parakeet.models.transformer_tts.vocoder import Vocoder


def load_checkpoint(step, model_path):
    model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
...@@ -23,8 +37,9 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict


def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
...@@ -35,23 +50,26 @@ def main(args):
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())

    if not os.path.exists(args.log_dir):
        os.mkdir(args.log_dir)
    path = os.path.join(args.log_dir, 'vocoder')

    writer = SummaryWriter(path) if local_rank == 0 else None

    with dg.guard(place):
        model = Vocoder(cfg, args.batch_size)
        model.train()
        optimizer = fluid.optimizer.AdamOptimizer(
            learning_rate=dg.NoamDecay(1 / (
                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
            parameter_list=model.parameters())

        if args.checkpoint_path is not None:
            model_dict, opti_dict = load_checkpoint(
                str(args.vocoder_step),
                os.path.join(args.checkpoint_path, "vocoder"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.vocoder_step
...@@ -61,48 +79,55 @@ def main(args):
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)

        reader = LJSpeechLoader(
            cfg, args, nranks, local_rank, is_vocoder=True).reader()

        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1

                mag_pred = model(mel)
                loss = layers.mean(
                    layers.abs(layers.elementwise_sub(mag_pred, mag)))

                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
                optimizer.minimize(
                    loss,
                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
                        'grad_clip_thresh']))
                model.clear_gradients()

                if local_rank == 0:
                    writer.add_scalars('training_loss', {
                        'loss': loss.numpy(),
                    }, global_step)

                    if global_step % args.save_step == 0:
                        if not os.path.exists(args.save_path):
                            os.mkdir(args.save_path)
                        save_path = os.path.join(args.save_path,
                                                 'vocoder/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)

        if local_rank == 0:
            writer.close()


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train vocoder model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(args)
    main(args)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.0.0" __version__ = "0.0.0"
from . import data, g2p, models, modules from . import data, g2p, models, modules
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio import AudioProcessor
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import librosa
import soundfile as sf
import numpy as np
import scipy.io
import scipy.signal


class AudioProcessor(object):
    def __init__(
            self,
            sample_rate=None,  # int, sampling rate
            num_mels=None,  # int, bands of mel spectrogram
            min_level_db=None,  # float, minimum level db
            ref_level_db=None,  # float, reference level db
            n_fft=None,  # int: number of samples in a frame for stft
            win_length=None,  # int: the same meaning with n_fft
            hop_length=None,  # int: number of samples between neighboring frames
            power=None,  # float: power to raise before griffin-lim
            preemphasis=None,  # float: preemphasis coefficient
            signal_norm=None,  #
            symmetric_norm=False,  # bool, apply clip norm in [-max_norm, max_norm]
            max_norm=None,  # float, max norm
            mel_fmin=None,  # int: mel spectrogram's minimum frequency
            mel_fmax=None,  # int: mel spectrogram's maximum frequency
            clip_norm=True,  # bool: clip spectrogram's norm
            griffin_lim_iters=None,  # int:
            do_trim_silence=False,  # bool: trim silence
            sound_norm=False,
            **kwargs):
        self.sample_rate = sample_rate
        self.num_mels = num_mels
        self.min_level_db = min_level_db
...@@ -34,8 +50,8 @@ class AudioProcessor(object):
        self.n_fft = n_fft
        self.win_length = win_length or n_fft
        # hop length defaults to 1/4 window_length
        self.hop_length = hop_length or 0.25 * self.win_length

        self.power = power
        self.preemphasis = float(preemphasis)
...@@ -52,7 +68,8 @@ class AudioProcessor(object):
        self.do_trim_silence = do_trim_silence

        self.sound_norm = sound_norm
        self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
        )

    def _stft_parameters(self):
        """compute frame length and hop length in ms"""
...@@ -65,44 +82,54 @@ class AudioProcessor(object):
        """object repr"""
        cls_name_str = self.__class__.__name__
        members = vars(self)
        dict_str = "\n".join(
            [" {}: {},".format(k, v) for k, v in members.items()])
        repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
        return repr_str

    def save_wav(self, path, wav):
        """save audio with scipy.io.wavfile in 16bit integers"""
        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
        scipy.io.wavfile.write(path, self.sample_rate,
                               wav_norm.astype(np.int16))

    def load_wav(self, path, sr=None):
        """load wav -> trim_silence -> rescale"""
        x, sr = librosa.load(path, sr=None)
        assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
            sr, self.sample_rate)
        if self.do_trim_silence:
            try:
                x = self.trim_silence(x)
            except ValueError:
                print(" [!] File cannot be trimmed for silence - {}".format(
                    path))
        if self.sound_norm:
            x = x / x.max() * 0.9  # why 0.9 ?
        return x

    def trim_silence(self, wav):
        """Trim silent parts with a threshold and a 0.01s margin"""
        margin = int(self.sample_rate * 0.01)
        wav = wav[margin:-margin]
        trimed_wav = librosa.effects.trim(
            wav,
            top_db=60,
            frame_length=self.win_length,
            hop_length=self.hop_length)[0]
        return trimed_wav

    def apply_preemphasis(self, x):
        if self.preemphasis == 0.:
            raise RuntimeError(
                " !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)

    def apply_inv_preemphasis(self, x):
        if self.preemphasis == 0.:
            raise RuntimeError(
                " !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)

    def _amplitude_to_db(self, x):
...@@ -125,12 +152,11 @@ class AudioProcessor(object):
        """return mel basis for mel scale"""
        if self.mel_fmax is not None:
            assert self.mel_fmax <= self.sample_rate // 2
        return librosa.filters.mel(self.sample_rate,
                                   self.n_fft,
                                   n_mels=self.num_mels,
                                   fmin=self.mel_fmin,
                                   fmax=self.mel_fmax)

    def _normalize(self, S):
        """put values in [0, self.max_norm] or [-self.max_norm, self.max_norm]"""
...@@ -156,25 +182,29 @@ class AudioProcessor(object):
            if self.symmetric_norm:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
                S_denorm = (S_denorm + self.max_norm) * (
                    -self.min_level_db) / (2 * self.max_norm
                                           ) + self.min_level_db
                return S_denorm
            else:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
                S_denorm = S_denorm * (-self.min_level_db
                                       ) / self.max_norm + self.min_level_db
                return S_denorm
        else:
            return S

    def _stft(self, y):
        return librosa.stft(
            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)

    def _istft(self, S):
        return librosa.istft(
            S, hop_length=self.hop_length, win_length=self.win_length)

    def spectrogram(self, y):
        """compute linear spectrogram (amplitude)
...@@ -195,7 +225,8 @@ class AudioProcessor(object):
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
        S = self._amplitude_to_db(self._linear_to_mel(np.abs(
            D))) - self.ref_level_db
        return self._normalize(S)

    def inv_spectrogram(self, spectrogram):
...@@ -203,16 +234,16 @@ class AudioProcessor(object):
        S = self._denormalize(spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        if self.preemphasis:
            return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
        return self._griffin_lim(S**self.power)

    def inv_melspectrogram(self, mel_spectrogram):
        S = self._denormalize(mel_spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        S = self._mel_to_linear(np.abs(S))
        if self.preemphasis:
            return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
        return self._griffin_lim(S**self.power)

    def out_linear_to_mel(self, linear_spec):
        """convert output linear spec to mel spec"""
...@@ -222,7 +253,7 @@ class AudioProcessor(object):
        S = self._amplitude_to_db(S) - self.ref_level_db
        mel = self._normalize(S)
        return mel

    def _griffin_lim(self, S):
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        S_complex = np.abs(S).astype(np.complex)
...@@ -234,18 +265,18 @@ class AudioProcessor(object):
    @staticmethod
    def mulaw_encode(wav, qc):
        mu = 2**qc - 1
        # wav_abs = np.minimum(np.abs(wav), 1.0)
        signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
        # Quantize signal to the specified number of levels.
        signal = (signal + 1) / 2 * mu + 0.5
        return np.floor(signal)

    @staticmethod
    def mulaw_decode(wav, qc):
        """Recovers waveform from quantized values."""
        mu = 2**qc - 1
        x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
        return x

    @staticmethod
...
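The static `mulaw_encode`/`mulaw_decode` pair above implements standard mu-law companding. A standalone sketch of the round trip with `qc=8` (the rescaling from quantization levels back to [-1, 1] before expansion is an assumption about how the decoder is meant to be fed; it is not shown in this diff):

```python
import numpy as np

qc = 8
mu = 2**qc - 1  # 255 quantization levels

x = np.array([-0.5, -0.1, 0.0, 0.1, 0.5])  # made-up samples in [-1, 1]

# compress: log companding, then map [-1, 1] onto integer levels {0, ..., mu}
signal = np.sign(x) * np.log(1 + mu * np.abs(x)) / np.log(1. + mu)
quantized = np.floor((signal + 1) / 2 * mu + 0.5)

# expand: map levels back to [-1, 1], then invert the companding
y = 2 * quantized / mu - 1
recovered = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)

print(np.max(np.abs(recovered - x)))  # quantization error on the order of 1e-2
```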
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .dataset import *
from .datacargo import *
from .sampler import *
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
functions to make batch for arrays which satisfy some conditions. functions to make batch for arrays which satisfy some conditions.
""" """
import numpy as np import numpy as np
class TextIDBatcher(object): class TextIDBatcher(object):
"""A wrapper class for a function to build a functor, which holds the configs to pass to the function.""" """A wrapper class for a function to build a functor, which holds the configs to pass to the function."""
def __init__(self, pad_id=0, dtype=np.int64): def __init__(self, pad_id=0, dtype=np.int64):
self.pad_id = pad_id self.pad_id = pad_id
self.dtype = dtype self.dtype = dtype
def __call__(self, minibatch): def __call__(self, minibatch):
out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype) out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype)
return out return out
def batch_text_id(minibatch, pad_id=0, dtype=np.int64): def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64): ...@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
""" """
peek_example = minibatch[0] peek_example = minibatch[0]
assert len(peek_example.shape) == 1, "text example is an 1D tensor" assert len(peek_example.shape) == 1, "text example is an 1D tensor"
lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) lengths = [example.shape[0] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths) max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[0] pad_len = max_len - example.shape[0]
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id)) batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_id))
return np.array(batch, dtype=dtype) return np.array(batch, dtype=dtype)
class WavBatcher(object): class WavBatcher(object):
def __init__(self, pad_value=0., dtype=np.float32): def __init__(self, pad_value=0., dtype=np.float32):
self.pad_value = pad_value self.pad_value = pad_value
self.dtype = dtype self.dtype = dtype
def __call__(self, minibatch): def __call__(self, minibatch):
out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype) out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out return out
def batch_wav(minibatch, pad_value=0., dtype=np.float32): def batch_wav(minibatch, pad_value=0., dtype=np.float32):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32): ...@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True mono_channel = True
elif len(peek_example.shape) == 2: elif len(peek_example.shape) == 2:
mono_channel = False mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, ) lengths = [example.shape[-1] for example in minibatch
] # assume (channel, n_samples) or (n_samples, )
max_len = np.max(lengths) max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[-1] pad_len = max_len - example.shape[-1]
if mono_channel: if mono_channel:
batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value)) batch.append(
np.pad(example, [(0, pad_len)],
mode='constant',
constant_values=pad_value))
else: else:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype) return np.array(batch, dtype=dtype)
...@@ -75,6 +104,7 @@ class SpecBatcher(object): ...@@ -75,6 +104,7 @@ class SpecBatcher(object):
out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype) out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
return out return out
def batch_spec(minibatch, pad_value=0., dtype=np.float32): def batch_spec(minibatch, pad_value=0., dtype=np.float32):
""" """
minibatch: List[Example] minibatch: List[Example]
...@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32): ...@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
mono_channel = True mono_channel = True
elif len(peek_example.shape) == 3: elif len(peek_example.shape) == 3:
mono_channel = False mono_channel = False
lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame) lengths = [example.shape[-1] for example in minibatch
max_len = np.max(lengths) ] # assume (channel, F, n_frame) or (F, n_frame)
max_len = np.max(lengths)
batch = [] batch = []
for example in minibatch: for example in minibatch:
pad_len = max_len - example.shape[-1] pad_len = max_len - example.shape[-1]
if mono_channel: if mono_channel:
batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) batch.append(
np.pad(example, [(0, 0), (0, pad_len)],
mode='constant',
constant_values=pad_value))
else: else:
batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no batch.append(
np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
return np.array(batch, dtype=dtype) mode='constant',
\ No newline at end of file constant_values=pad_value)) # what about PCM, no
return np.array(batch, dtype=dtype)
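All three batchers share one rule: right-pad every example along its last axis to the length of the longest example in the minibatch. A minimal standalone sketch of that rule for spectrogram-shaped inputs (the array shapes are made up for illustration):

```python
import numpy as np

# Two fake (F, n_frame) spectrograms with different numbers of frames.
specs = [np.ones((4, 3)), np.ones((4, 5))]

max_len = max(s.shape[-1] for s in specs)
batch = np.array([
    np.pad(s, [(0, 0), (0, max_len - s.shape[-1])],
           mode='constant',
           constant_values=0.) for s in specs
])
print(batch.shape)  # (2, 4, 5): the shorter example was padded by two frames
```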
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
from .sampler import SequentialSampler, RandomSampler, BatchSampler
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import numpy as np
...@@ -9,8 +23,7 @@ class DatasetMixin(object):
        if isinstance(index, slice):
            start, stop, step = index.indices(len(self))
            return [
                self.get_example(i) for i in six.moves.range(start, stop, step)
            ]
        elif isinstance(index, (list, np.ndarray)):
            return [self.get_example(i) for i in index]
...@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
    def get_example(self, i):
        if i < 0:
            raise IndexError("ChainDataset does not support negative indexing.")
        for dataset in self._datasets:
            if i < len(dataset):
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__. At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__.
...@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices. ...@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
So the sampler is only responsible for generating valid indices. So the sampler is only responsible for generating valid indices.
""" """
import numpy as np import numpy as np
import random import random
class Sampler(object): class Sampler(object):
def __init__(self, data_source): def __init__(self, data_source):
pass pass
...@@ -23,7 +36,7 @@ class Sampler(object): ...@@ -23,7 +36,7 @@ class Sampler(object):
class SequentialSampler(Sampler): class SequentialSampler(Sampler):
def __init__(self, data_source): def __init__(self, data_source):
self.data_source = data_source self.data_source = data_source
def __iter__(self): def __iter__(self):
return iter(range(len(self.data_source))) return iter(range(len(self.data_source)))
...@@ -42,12 +55,14 @@ class RandomSampler(Sampler): ...@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
"replacement={}".format(self.replacement)) "replacement={}".format(self.replacement))
if self._num_samples is not None and not replacement: if self._num_samples is not None and not replacement:
raise ValueError("With replacement=False, num_samples should not be specified, " raise ValueError(
"since a random permutation will be performed.") "With replacement=False, num_samples should not be specified, "
"since a random permutation will be performed.")
if not isinstance(self.num_samples, int) or self.num_samples <= 0: if not isinstance(self.num_samples, int) or self.num_samples <= 0:
raise ValueError("num_samples should be a positive integer " raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(self.num_samples)) "value, but got num_samples={}".format(
self.num_samples))
@property @property
def num_samples(self): def num_samples(self):
...@@ -59,7 +74,9 @@ class RandomSampler(Sampler): ...@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
def __iter__(self): def __iter__(self):
n = len(self.data_source) n = len(self.data_source)
if self.replacement: if self.replacement:
return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist()) return iter(
np.random.randint(
0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
return iter(np.random.permutation(n).tolist()) return iter(np.random.permutation(n).tolist())
def __len__(self): def __len__(self):
...@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler): ...@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
self.indices = indices self.indices = indices
def __iter__(self): def __iter__(self):
return (self.indices[i] for i in np.random.permutation(len(self.indices))) return (self.indices[i]
for i in np.random.permutation(len(self.indices)))
def __len__(self): def __len__(self):
return len(self.indices) return len(self.indices)
...@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): ...@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
3. Permutate mini-batchs 3. Permutate mini-batchs
""" """
def __init__(self, lengths, batch_size=4, batch_group_size=None, def __init__(self,
lengths,
batch_size=4,
batch_group_size=None,
permutate=True): permutate=True):
_lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key _lengths = np.array(
lengths,
dtype=np.int64) # maybe better implement length as a sort key
self.lengths = np.sort(_lengths) self.lengths = np.sort(_lengths)
self.sorted_indices = np.argsort(_lengths) self.sorted_indices = np.argsort(_lengths)
...@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler): ...@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
for i in range(len(indices) // batch_group_size): for i in range(len(indices) // batch_group_size):
s = i * batch_group_size s = i * batch_group_size
e = s + batch_group_size e = s + batch_group_size
random.shuffle(indices[s: e]) # inplace random.shuffle(indices[s:e]) # inplace
# Permutate batches # Permutate batches
if self.permutate: if self.permutate:
perm = np.arange(len(indices[:e]) // self.batch_size) perm = np.arange(len(indices[:e]) // self.batch_size)
random.shuffle(perm) random.shuffle(perm)
indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1) indices[:e] = indices[:e].reshape(
-1, self.batch_size)[perm, :].reshape(-1)
# Handle last elements # Handle last elements
s += batch_group_size s += batch_group_size
#print(indices) #print(indices)
if s < len(indices): if s < len(indices):
random.shuffle(indices[s:]) random.shuffle(indices[s:])
return iter(indices) return iter(indices)
def __len__(self): def __len__(self):
...@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler): ...@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
def __init__(self, weights, num_samples, replacement): def __init__(self, weights, num_samples, replacement):
if not isinstance(num_samples, int) or num_samples <= 0: if not isinstance(num_samples, int) or num_samples <= 0:
raise ValueError("num_samples should be a positive integer " raise ValueError("num_samples should be a positive integer "
"value, but got num_samples={}".format(num_samples)) "value, but got num_samples={}".format(
num_samples))
self.weights = np.array(weights, dtype=np.float64) self.weights = np.array(weights, dtype=np.float64)
self.num_samples = num_samples self.num_samples = num_samples
self.replacement = replacement self.replacement = replacement
def __iter__(self): def __iter__(self):
return iter(np.random.choice(len(self.weights), size=(self.num_samples, ), return iter(
replace=self.replacement, p=self.weights).tolist()) np.random.choice(
len(self.weights),
size=(self.num_samples, ),
replace=self.replacement,
p=self.weights).tolist())
def __len__(self): def __len__(self):
return self.num_samples return self.num_samples
...@@ -184,7 +213,7 @@ class DistributedSampler(Sampler): ...@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
# Subset samples for each trainer. # Subset samples for each trainer.
indices = indices[self.rank:self.total_size:self.num_trainers] indices = indices[self.rank:self.total_size:self.num_trainers]
assert len(indices) == self.num_samples assert len(indices) == self.num_samples
return iter(indices) return iter(indices)
...@@ -209,8 +238,7 @@ class BatchSampler(Sampler): ...@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
def __init__(self, sampler, batch_size, drop_last): def __init__(self, sampler, batch_size, drop_last):
if not isinstance(sampler, Sampler): if not isinstance(sampler, Sampler):
raise ValueError("sampler should be an instance of " raise ValueError("sampler should be an instance of "
"Sampler, but got sampler={}" "Sampler, but got sampler={}".format(sampler))
.format(sampler))
if not isinstance(batch_size, int) or batch_size <= 0: if not isinstance(batch_size, int) or batch_size <= 0:
raise ValueError("batch_size should be a positive integer value, " raise ValueError("batch_size should be a positive integer value, "
"but got batch_size={}".format(batch_size)) "but got batch_size={}".format(batch_size))
......
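The module docstring above defines the sampler contract loosely: a sampler is just an iterable of valid indices into a random-access dataset. Under that contract, a custom sampler needs little more than `__iter__` (and usually `__len__`). A minimal sketch using the `Sampler` base class defined above, with an invented selection rule (every other example):

```python
class EvenIndexSampler(Sampler):
    """Hypothetical sampler: yields only the even indices of the dataset."""

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        # generate valid indices 0, 2, 4, ... into the data source
        return iter(range(0, len(self.data_source), 2))

    def __len__(self):
        return (len(self.data_source) + 1) // 2


# list(iter(EvenIndexSampler(list(range(7)))))  -> [0, 2, 4, 6]
```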
...@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata beforehand
For deep learning practice, we typically batch examples. So the dataset should come with a method to batch examples. Assume each record is implemented as a tuple with several items. When an item is represented as a fixed-size array, batching it is trivial: `np.stack` suffices. But for arrays with dynamic sizes, padding is needed, so we implement a batching method for each kind of item; batching a whole record is then composed from these methods. A dataset should implement a `_batch_examples` method, but in most cases you can simply choose one from `batching.py`, as the sketch below illustrates.

That is it!
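To make the `np.stack`-versus-padding distinction concrete, here is a hypothetical `_batch_examples` for a dataset whose records are `(speaker_id, phoneme_ids)` tuples (the record layout and names are invented for illustration):

```python
import numpy as np


def _batch_examples(minibatch, pad_id=0):
    # fixed-size item: one scalar speaker id per record, so np.stack suffices
    speakers = np.stack([example[0] for example in minibatch])

    # dynamic-size item: variable-length phoneme id sequences need padding
    phonemes = [example[1] for example in minibatch]
    max_len = max(p.shape[0] for p in phonemes)
    phonemes = np.array([
        np.pad(p, [(0, max_len - p.shape[0])],
               mode='constant',
               constant_values=pad_id) for p in phonemes
    ])
    return speakers, phonemes
```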
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import numpy as np
import pandas as pd
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from pathlib import Path
import pandas as pd
from ruamel.yaml import YAML
...@@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset
from parakeet.data.datacargo import DataCargo
from parakeet.data.batch import TextIDBatcher, WavBatcher
class VCTK(Dataset):
    def __init__(self, root):
        assert isinstance(root, (
            str, Path)), "root should be a string or Path object"
        self.root = root if isinstance(root, Path) else Path(root)
        self.text_root = self.root.joinpath("txt")
        self.wav_root = self.root.joinpath("wav48")

        if not (self.root.joinpath("metadata.csv").exists() and
                self.root.joinpath("speaker_indices.yaml").exists()):
            self._prepare_metadata()
        self.speaker_indices, self.metadata = self._load_metadata()

    def _load_metadata(self):
        yaml = YAML(typ='safe')
        speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
        metadata = pd.read_csv(
            self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
        return speaker_indices, metadata

    def _prepare_metadata(self):
...@@ -41,15 +57,19 @@ class VCTK(Dataset):
            with io.open(str(text_file)) as f:
                transcription = f.read().strip()
            wav_file = text_file.with_suffix(".wav")
            metadata.append(
                (wav_file.name, speaker_folder.name, transcription))
        metadata = pd.DataFrame.from_records(
            metadata, columns=["wave_file", "speaker", "text"])

        # save them
        yaml = YAML(typ='safe')
        yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
        metadata.to_csv(
            self.root.joinpath("metadata.csv"),
            sep="|",
            quoting=3,
            index=False)

    def _get_example(self, metadatum):
        wave_file, speaker, text = metadatum
...@@ -77,5 +97,3 @@ class VCTK(Dataset):
        speaker_batch = np.array(speaker_batch)
        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
        return wav_batch, speaker_batch, phoneme_batch
\ No newline at end of file
# coding: utf-8
"""Text processing frontend
All frontend modules should have the following functions:
......
...@@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
    from ..text import text_to_sequence
    text = text_to_sequence(text, ["english_cleaners"])
    return text
...@@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
    from ..text import text_to_sequence
    text = text_to_sequence(text, ["basic_cleaners"])
    return text
# coding: utf-8
import MeCab
import jaconv
from random import random
...@@ -30,9 +29,9 @@ def _yomi(mecab_result):
def _mix_pronunciation(tokens, yomis, p):
    return "".join(yomis[idx]
                   if yomis[idx] is not None and random() < p else tokens[idx]
                   for idx in range(len(tokens)))


def mix_pronunciation(text, p):
...@@ -59,8 +58,7 @@ def normalize_delimitor(text):
def text_to_sequence(text, p=0.0):
    for c in [" ", " ", "「", "」", "『", "』", "・", "【", "】", "(", ")", "(", ")"]:
        text = text.replace(c, "")
    text = text.replace("!", "!")
    text = text.replace("?", "?")
......
# coding: utf-8
from random import random

n_vocab = 0xffff
...@@ -13,5 +12,6 @@ _tagger = None
def text_to_sequence(text, p=0.0):
    return [ord(c) for c in text] + [_eos]  # EOS


def sequence_to_text(seq):
    return "".join(chr(n) for n in seq)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
from . import cleaners
from .symbols import symbols

# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}
...@@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(
            _clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Cleaners are transformations that run over the input text at both training and eval time.
...@@ -14,31 +27,31 @@ import re
from unidecode import unidecode
from .numbers import normalize_numbers
# Regular expression matching whitespace:
_whitespace_re = re.compile(r'\s+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                  for x in [
                      ('mrs', 'misess'),
                      ('mr', 'mister'),
                      ('dr', 'doctor'),
                      ('st', 'saint'),
                      ('co', 'company'),
                      ('jr', 'junior'),
                      ('maj', 'major'),
                      ('gen', 'general'),
                      ('drs', 'doctors'),
                      ('rev', 'reverend'),
                      ('lt', 'lieutenant'),
                      ('hon', 'honorable'),
                      ('sgt', 'sergeant'),
                      ('capt', 'captain'),
                      ('esq', 'esquire'),
                      ('ltd', 'limited'),
                      ('col', 'colonel'),
                      ('ft', 'fort'),
                  ]]


def expand_abbreviations(text):
......
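To see what the cleaner pipeline does with this table, here is a standalone sketch of abbreviation expansion in the same style (a trimmed re-implementation for illustration; the real `english_cleaners` also applies `unidecode`, `normalize_numbers`, lowercasing, and whitespace collapsing):

```python
import re

_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
                  for x in [('mrs', 'misess'), ('dr', 'doctor')]]


def expand_abbreviations(text):
    # Replace each abbreviated form with its spelled-out replacement.
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


print(expand_abbreviations("Dr. Smith met Mrs. Jones."))
# -> doctor Smith met misess Jones.
```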
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import re
valid_symbols = [
    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
    'Y', 'Z', 'ZH'
]

_valid_symbol_set = set(valid_symbols)
...@@ -24,7 +38,10 @@ class CMUDict:
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
            entries = {
                word: pron
                for word, pron in entries.items() if len(pron) == 1
            }
        self._entries = entries

    def __len__(self):
......
...@@ -3,7 +3,6 @@
import inflect
import re

_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
...@@ -56,7 +55,8 @@ def _expand_number(m):
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(
                num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')
......
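For intuition about the year-style branch above, a quick sketch of what `inflect` produces with these options (outputs may vary slightly across inflect versions):

```python
import inflect

_inflect = inflect.engine()

# The num % 100 == 0 branch: 1900 -> "nineteen hundred"
print(_inflect.number_to_words(1900 // 100) + ' hundred')

# The general branch: read two digits at a time, with "0" spoken as "oh"
print(_inflect.number_to_words(
    1984, andword='', zero='oh', group=2).replace(', ', ' '))
# -> nineteen eighty-four
```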
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Defines the set of symbols used in text input to the model.
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
from parakeet.models.deepvoice3.converter import Converter
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
from paddle import fluid
...@@ -19,23 +33,19 @@ class Attention(dg.Layer):
                 value_projection=True):
        super(Attention, self).__init__()
        std = np.sqrt(1 / query_dim)
        self.query_proj = Linear(
            query_dim, embed_dim, param_attr=I.Normal(scale=std))
        if key_projection:
            std = np.sqrt(1 / embed_dim)
            self.key_proj = Linear(
                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
        if value_projection:
            std = np.sqrt(1 / embed_dim)
            self.value_proj = Linear(
                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
        std = np.sqrt(1 / embed_dim)
        self.out_proj = Linear(
            embed_dim, query_dim, param_attr=I.Normal(scale=std))

        self.key_projection = key_projection
        self.value_projection = value_projection
...@@ -102,9 +112,8 @@ class Attention(dg.Layer):
        x = F.softmax(x)
        attn_scores = x

        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = F.matmul(x, values)
        encoder_length = keys.shape[1]
        # CAUTION: is it wrong? let it be now
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
...@@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
    has residual connection from the input x, and scale the output by
    np.sqrt(0.5).
    """

    def __init__(self,
                 n_speakers,
                 speaker_dim,
...@@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
        ), "this block uses residual connection"\
            "the input_channels should equal num_filters"
        std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
        self.conv = Conv1DCell(
            in_channels,
            2 * num_filters,
            filter_size,
            dilation,
            causal,
            param_attr=I.Normal(scale=std))

        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            std = np.sqrt(1 / speaker_dim)
            self.fc = Linear(
                speaker_dim, num_filters, param_attr=I.Normal(scale=std))
    def forward(self, x, speaker_embed=None):
        """
...@@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
            C_out means the output channels of Conv1DGLU.
        """
        residual = x
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = self.conv(x)
        content, gate = F.split(x, num_or_sections=2, dim=1)
...@@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
            C_out means the output channels of Conv1DGLU.
        """
        residual = x_t
        x_t = F.dropout(
            x_t, self.dropout, dropout_implementation="upscale_in_train")
        x_t = self.conv.add_input(x_t)
        content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)
......
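The gating in `Conv1DGLU` is easier to see outside the Paddle API; a minimal NumPy sketch of one GLU step with the scaled residual (names and shapes are illustrative, not the module's API):

```python
import numpy as np


def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))


def conv1dglu_step(conv_out, residual):
    """conv_out: (B, 2*C, T) output of the convolution;
    residual: (B, C, T) input of the block."""
    content, gate = np.split(conv_out, 2, axis=1)  # halve the channels
    x = content * sigmoid(gate)                    # gated linear unit
    return np.sqrt(0.5) * (residual + x)           # scaled residual connection
```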
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from itertools import chain
...@@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=1,
            std_mul=1.,
            dropout=dropout),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout),
        Conv1DTranspose(
            target_channels,
            target_channels,
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=1,
            std_mul=1.,
            dropout=dropout),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]
    return upsampling_convolutions
...@@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=1,
            std_mul=1.,
            dropout=dropout),
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]
    return upsampling_convolutions
def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    upsampling_convolutions = [
        Conv1DGLU(
            n_speakers,
            speaker_dim,
            target_channels,
            target_channels,
            3,
            dilation=3,
            std_mul=4.,
            dropout=dropout)
    ]
    return upsampling_convolutions
...@@ -108,6 +125,7 @@ class Converter(dg.Layer):
    Vocoder that transforms mel spectrogram (or encoder hidden states)
    to waveform.
    """

    def __init__(self,
                 n_speakers,
                 speaker_dim,
...@@ -161,33 +179,36 @@ class Converter(dg.Layer):
                std = np.sqrt(std_mul / in_channels)
                # CAUTION: relu
                self.convolutions.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation=dilation,
                    std_mul=std_mul,
                    dropout=dropout))
            in_channels = out_channels
            std_mul = 4.0

        # final conv proj, channel transformed to linear dim
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        # CAUTION: sigmoid
        self.last_conv_proj = Conv1D(
            in_channels,
            linear_dim,
            1,
            act="sigmoid",
            param_attr=I.Normal(scale=std))
    def forward(self, x, speaker_embed=None):
        """
...@@ -229,4 +250,4 @@ class Converter(dg.Layer):
        out = self.last_conv_proj(x)
        out = F.transpose(out, [0, 2, 1])
        return out
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
import paddle.fluid.initializer as I
...@@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
class Decoder(dg.Layer):
    def __init__(
            self,
            n_speakers,
            speaker_dim,
            embed_dim,
            mel_dim,
            r=1,
            max_positions=512,
            padding_idx=None,  # remove it!
            preattention=(ConvSpec(128, 5, 1), ) * 4,
            convolutions=(ConvSpec(128, 5, 1), ) * 4,
            attention=True,
            dropout=0.0,
            use_memory_mask=False,
            force_monotonic_attention=False,
            query_position_rate=1.0,
            key_position_rate=1.0,
            window_range=WindowRange(-1, 3),
            key_projection=True,
            value_projection=True):
        super(Decoder, self).__init__()
        self.dropout = dropout
...@@ -111,23 +125,17 @@ class Decoder(dg.Layer):
        conv_channels = convolutions[0].out_channels
        # only when padding idx is 0 can we easily handle it
        self.embed_keys_positions = PositionEmbedding(
            max_positions, embed_dim, padding_idx=0)
        self.embed_query_positions = PositionEmbedding(
            max_positions, conv_channels, padding_idx=0)

        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.speaker_proj1 = Linear(
                speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
            self.speaker_proj2 = Linear(
                speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))

        # prenet
        self.prenet = dg.LayerList()
...@@ -138,24 +146,26 @@ class Decoder(dg.Layer):
                # conv1d & relu
                std = np.sqrt(std_mul / in_channels)
                self.prenet.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation,
                    std_mul,
                    dropout,
                    causal=True,
                    residual=True))
            in_channels = out_channels
            std_mul = 4.0
...@@ -184,16 +194,17 @@ class Decoder(dg.Layer):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
            conv_layer = Conv1DGLU(
                n_speakers,
                speaker_dim,
                in_channels,
                out_channels,
                filter_size,
                dilation,
                std_mul,
                dropout,
                causal=True,
                residual=False)
            attn_layer = Attention(
                out_channels,
                embed_dim,
...@@ -211,10 +222,8 @@ class Decoder(dg.Layer):
        # 1 * 1 conv to transform channels
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.last_conv = Conv1D(
            in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))

        # mel (before sigmoid) to done hat
        std = np.sqrt(1 / in_channels)
...@@ -308,9 +317,8 @@ class Decoder(dg.Layer):
        # (B, C, T)
        frames = F.transpose(frames, [0, 2, 1])
        x = frames
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")

        # Prenet
        for layer in self.prenet:
            if isinstance(layer, Conv1DGLU):
...@@ -408,14 +416,13 @@ class Decoder(dg.Layer):
            test_inputs = fold_adjacent_frames(test_inputs, self.r)
            test_inputs = F.transpose(test_inputs, [0, 2, 1])

        initial_input = F.zeros(
            (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)

        t = 0  # decoder time step
        while True:
            frame_pos = F.fill_constant(
                (batch_size, 1), value=t + 1, dtype="int64")
            w = self.query_position_rate
            if self.n_speakers > 1:
                w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
...@@ -433,9 +440,8 @@ class Decoder(dg.Layer):
            current_input = initial_input
            x_t = current_input
            x_t = F.dropout(
                x_t, self.dropout, dropout_implementation="upscale_in_train")

            # Prenet
            for layer in self.prenet:
...@@ -453,15 +459,15 @@ class Decoder(dg.Layer):
                x_t = F.transpose(x_t, [0, 2, 1])
                if frame_pos_embed is not None:
                    x_t += frame_pos_embed
                x_t, attn_scores = attn(x_t, (keys, values), mask,
                                        last_attended[i]
                                        if test_inputs is None else None)
                x_t = F.transpose(x_t, [0, 2, 1])
                step_attn_scores.append(attn_scores)  #(B, T_dec=1, T_enc)
                # update last attended when necessary
                if self.force_monotonic_attention[i]:
                    last_attended[i] = np.argmax(
                        attn_scores.numpy(), axis=-1)[0][0]
            x_t = F.scale(residual + x_t, np.sqrt(0.5))
            if len(step_attn_scores):
                # (B, 1, T_enc) again
...@@ -485,8 +491,8 @@ class Decoder(dg.Layer):
            t += 1

            if test_inputs is None:
                if F.reduce_min(done_t).numpy()[
                        0] > 0.5 and t > self.min_decoder_steps:
                    break
                elif t > self.max_decoder_steps:
                    break
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from collections import namedtuple
...@@ -33,14 +47,16 @@ class Encoder(dg.Layer):
        self.dropout = dropout
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
            self.sp_proj1 = Linear(
                speaker_dim,
                embed_dim,
                act="softsign",
                param_attr=I.Normal(scale=std))
            self.sp_proj2 = Linear(
                speaker_dim,
                embed_dim,
                act="softsign",
                param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers

        self.convolutions = dg.LayerList()
...@@ -51,31 +67,34 @@ class Encoder(dg.Layer):
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
                    Conv1D(
                        in_channels,
                        out_channels,
                        1,
                        act="relu",
                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
                Conv1DGLU(
                    n_speakers,
                    speaker_dim,
                    in_channels,
                    out_channels,
                    filter_size,
                    dilation,
                    std_mul,
                    dropout,
                    causal=False,
                    residual=True))
            in_channels = out_channels
            std_mul = 4.0

        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
            Conv1D(
                in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
    def forward(self, x, speaker_embed=None):
        """
...@@ -96,9 +115,8 @@ class Encoder(dg.Layer):
            representation for values.
        """
        x = self.embed(x)
        x = F.dropout(
            x, self.dropout, dropout_implementation="upscale_in_train")
        x = F.transpose(x, [0, 2, 1])

        if self.n_speakers > 1 and speaker_embed is not None:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from numba import jit
...@@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
    return W


def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
                      g=0.2):
    B = len(encoder_lengths)
    max_input_len = encoder_lengths.max()
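For readers new to guided attention: the weight matrix W penalizes attention that wanders off the text/speech diagonal. A sketch of the usual formulation (the body of `guided_attention` is elided above; this follows the common guided-attention recipe, not necessarily this file verbatim):

```python
import numpy as np


def guided_attention_sketch(N, T, g=0.2):
    """W[n, t] = 1 - exp(-((n/N - t/T)^2) / (2 g^2)).
    Near-diagonal entries (n/N close to t/T) are ~0; distant ones approach 1."""
    W = np.zeros((N, T), dtype=np.float32)
    for n in range(N):
        for t in range(T):
            W[n, t] = 1.0 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
    return W
```

Multiplying W elementwise with the attention matrix and averaging then yields a loss that is small only when attention stays near the diagonal.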
...@@ -93,9 +105,8 @@ class TTSLoss(object):
    def binary_divergence(self, prediction, target, mask):
        flattened_prediction = F.reshape(prediction, [-1, 1])
        flattened_target = F.reshape(target, [-1, 1])
        flattened_loss = F.log_loss(
            flattened_prediction, flattened_target, epsilon=1e-8)
        bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)

        w = self.masked_weight
...@@ -163,23 +174,20 @@ class TTSLoss(object):
        max_mel_steps = max_frames // self.downsample_factor
        max_decoder_steps = max_mel_steps // self.r

        decoder_mask = F.sequence_mask(
            n_frames // self.downsample_factor // self.r,
            max_decoder_steps,
            dtype="float32")
        mel_mask = F.sequence_mask(
            n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
        lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")

        if compute_lin_loss:
            lin_hyp = lin_hyp[:, :-self.time_shift, :]
            lin_ref = lin_ref[:, self.time_shift:, :]
            lin_mask = lin_mask[:, self.time_shift:, :]
            lin_l1_loss = self.l1_loss(
                lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
            lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
            lin_loss = self.binary_divergence_weight * lin_bce_loss \
                + (1 - self.binary_divergence_weight) * lin_l1_loss
...@@ -197,9 +205,10 @@ class TTSLoss(object):
            total_loss += mel_loss

        if compute_attn_loss:
            attn_loss = self.attention_loss(attn_hyp,
                                            input_lengths.numpy(),
                                            n_frames.numpy() //
                                            (self.downsample_factor * self.r))
            total_loss += attn_loss

        if compute_done_loss:
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import paddle.fluid.layers as F
...@@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
        mel_outputs, alignments, done, decoder_states = self.decoder(
            (keys, values), valid_lengths, mel_inputs, text_positions,
            frame_positions, speaker_embed)
        linear_outputs = self.converter(decoder_states
                                        if self.use_decoder_states else
                                        mel_outputs, speaker_embed)
        return mel_outputs, linear_outputs, alignments, done
    def transduce(self, text_sequences, text_positions, speaker_indices=None):
...@@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder.decode(
            (keys, values), text_positions, speaker_embed)
        linear_outputs = self.converter(decoder_states
                                        if self.use_decoder_states else
                                        mel_outputs, speaker_embed)
        return mel_outputs, linear_outputs, alignments, done
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.layers as F
...@@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer):
            speaker_position_rate)  # (B, V, C)
        # make indices for gather_nd
        batch_id = F.expand(
            F.unsqueeze(
                F.range(
                    0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
        # (B, T, 2)
        gather_nd_id = F.stack([batch_id, indices], -1)
        out = F.gather_nd(weight, gather_nd_id)
        return out
\ No newline at end of file
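The batched gather above is easier to follow in plain NumPy; a small sketch of the same indexing trick (illustrative shapes, not the Paddle API):

```python
import numpy as np

B, T, V, C = 2, 3, 5, 4                      # batch, time, positions, channels
weight = np.random.randn(B, V, C)            # per-example position tables
indices = np.random.randint(0, V, (B, T))    # scaled positions, (B, T)

# batch_id[b, t] == b, so the (batch_id, indices) pairs pick
# weight[b, indices[b, t]] for every (b, t), just like gather_nd.
batch_id = np.broadcast_to(np.arange(B)[:, None], (B, T))
out = weight[batch_id, indices]              # (B, T, C)
```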
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock


class Decoder(dg.Layer):
    def __init__(self,
                 len_max_seq,
...@@ -18,16 +32,29 @@ class Decoder(dg.Layer):
        super(Decoder, self).__init__()

        n_position = len_max_seq + 1

        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_model, padding_idx=0)
        self.position_enc = dg.Embedding(
            size=[n_position, d_model],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.layer_stack = [
            FFTBlock(
                d_model,
                d_inner,
                n_head,
                d_k,
                d_v,
                fft_conv1d_kernel,
                fft_conv1d_padding,
                dropout=dropout) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
    def forward(self, enc_seq, enc_pos):
        """
        Decoder layer of FastSpeech.
...@@ -57,4 +84,4 @@ class Decoder(dg.Layer):
                slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list += [dec_slf_attn]

        return dec_output, dec_slf_attn_list
\ No newline at end of file
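`get_sinusoid_encoding_table` comes from the transformer_tts utils; a hedged sketch of the standard sinusoid table it presumably builds (sin on even channels, cos on odd, zeros at the padding index):

```python
import numpy as np


def sinusoid_table_sketch(n_position, d_model, padding_idx=None):
    pos = np.arange(n_position)[:, None]     # (n_position, 1)
    i = np.arange(d_model)[None, :]          # (1, d_model)
    angle = pos / np.power(10000.0, 2 * (i // 2) / d_model)
    table = np.zeros((n_position, d_model), dtype=np.float32)
    table[:, 0::2] = np.sin(angle[:, 0::2])  # even channels
    table[:, 1::2] = np.cos(angle[:, 1::2])  # odd channels
    if padding_idx is not None:
        table[padding_idx] = 0.0             # padding position embeds to zeros
    return table
```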
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
from parakeet.models.fastspeech.fft_block import FFTBlock


class Encoder(dg.Layer):
    def __init__(self,
                 n_src_vocab,
...@@ -19,14 +33,28 @@ class Encoder(dg.Layer):
        super(Encoder, self).__init__()
        n_position = len_max_seq + 1

        self.src_word_emb = dg.Embedding(
            size=[n_src_vocab, d_model], padding_idx=0)
        self.pos_inp = get_sinusoid_encoding_table(
            n_position, d_model, padding_idx=0)
        self.position_enc = dg.Embedding(
            size=[n_position, d_model],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.layer_stack = [
            FFTBlock(
                d_model,
                d_inner,
                n_head,
                d_k,
                d_v,
                fft_conv1d_kernel,
                fft_conv1d_padding,
                dropout=dropout) for _ in range(n_layers)
        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
...@@ -52,7 +80,8 @@ class Encoder(dg.Layer):
        non_pad_mask = get_non_pad_mask(character)

        # -- Forward
        enc_output = self.src_word_emb(character) + self.position_enc(
            text_pos)  #(N, T, C)

        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
...@@ -60,5 +89,5 @@ class Encoder(dg.Layer):
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            enc_slf_attn_list += [enc_slf_attn]

        return enc_output, non_pad_mask, enc_slf_attn_list
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
...@@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
from parakeet.models.fastspeech.encoder import Encoder
from parakeet.models.fastspeech.decoder import Decoder
class FastSpeech(dg.Layer):
    def __init__(self, cfg):
        " FastSpeech"
        super(FastSpeech, self).__init__()

        self.encoder = Encoder(
            n_src_vocab=len(symbols) + 1,
            len_max_seq=cfg['max_seq_len'],
            n_layers=cfg['encoder_n_layer'],
            n_head=cfg['encoder_head'],
            d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
            d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
            d_model=cfg['fs_hidden_size'],
            d_inner=cfg['encoder_conv1d_filter_size'],
            fft_conv1d_kernel=cfg['fft_conv1d_filter'],
            fft_conv1d_padding=cfg['fft_conv1d_padding'],
            dropout=0.1)
        self.length_regulator = LengthRegulator(
            input_size=cfg['fs_hidden_size'],
            out_channels=cfg['duration_predictor_output_size'],
            filter_size=cfg['duration_predictor_filter_size'],
            dropout=cfg['dropout'])
        self.decoder = Decoder(
            len_max_seq=cfg['max_seq_len'],
            n_layers=cfg['decoder_n_layer'],
            n_head=cfg['decoder_head'],
            d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
            d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
            d_model=cfg['fs_hidden_size'],
            d_inner=cfg['decoder_conv1d_filter_size'],
            fft_conv1d_kernel=cfg['fft_conv1d_filter'],
            fft_conv1d_padding=cfg['fft_conv1d_padding'],
            dropout=0.1)
        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        k = math.sqrt(1 / cfg['fs_hidden_size'])
        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-k, high=k))
        self.mel_linear = dg.Linear(
            cfg['fs_hidden_size'],
            cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
            param_attr=self.weight,
            bias_attr=self.bias, )
        self.postnet = PostConvNet(
            n_mels=cfg['audio']['num_mels'],
            num_hidden=512,
            filter_size=5,
            padding=int(5 / 2),
            num_conv=5,
            outputs_per_step=cfg['audio']['outputs_per_step'],
            use_cudnn=True,
            dropout=0.1,
            batchnorm_last=True)

    def forward(self,
                character,
                text_pos,
                mel_pos=None,
                length_target=None,
                alpha=1.0):
""" """
FastSpeech model. FastSpeech model.
...@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer): ...@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer):
dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list. dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
""" """
encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos) encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
character, text_pos)
if fluid.framework._dygraph_tracer()._train_mode: if fluid.framework._dygraph_tracer()._train_mode:
length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output, length_regulator_output, duration_predictor_output = self.length_regulator(
target=length_target, encoder_output, target=length_target, alpha=alpha)
alpha=alpha) decoder_output, dec_slf_attn_list = self.decoder(
decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos) length_regulator_output, mel_pos)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
else: else:
length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha) length_regulator_output, decoder_pos = self.length_regulator(
decoder_output, _ = self.decoder(length_regulator_output, decoder_pos) encoder_output, alpha=alpha)
decoder_output, _ = self.decoder(length_regulator_output,
decoder_pos)
mel_output = self.mel_linear(decoder_output) mel_output = self.mel_linear(decoder_output)
mel_output_postnet = self.postnet(mel_output) + mel_output mel_output_postnet = self.postnet(mel_output) + mel_output
return mel_output, mel_output_postnet return mel_output, mel_output_postnet
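A note on the `mel_linear` width above: the decoder emits `num_mels * outputs_per_step` values per step (the reduction factor r), which unfold into r consecutive mel frames. A toy NumPy sketch of that unfolding, with made-up sizes:

```python
import numpy as np

# Toy sketch of the reduction factor ("outputs_per_step"): sizes here
# are made up, and the unfolding is shown as a plain reshape.
num_mels, r, T = 4, 2, 3
decoder_out = np.random.rand(1, T, num_mels * r)      # (B, T, num_mels * r)
mel_frames = decoder_out.reshape(1, T * r, num_mels)  # (B, T * r, num_mels)
print(mel_frames.shape)                               # (1, 6, 4)
```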
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import paddle.fluid.dygraph as dg
@@ -6,11 +19,32 @@
import paddle.fluid as fluid
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward


class FFTBlock(dg.Layer):
    def __init__(self,
                 d_model,
                 d_inner,
                 n_head,
                 d_k,
                 d_v,
                 filter_size,
                 padding,
                 dropout=0.2):
        super(FFTBlock, self).__init__()
        self.slf_attn = MultiheadAttention(
            d_model,
            d_k,
            d_v,
            num_head=n_head,
            is_bias=True,
            dropout=dropout,
            is_concat=False)
        self.pos_ffn = PositionwiseFeedForward(
            d_model,
            d_inner,
            filter_size=filter_size,
            padding=padding,
            dropout=dropout)

    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """
@@ -27,10 +61,11 @@ class FFTBlock(dg.Layer):
            output (Variable), Shape(B, T, C), the output after self-attention & ffn.
            slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
        """
        output, slf_attn = self.slf_attn(
            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        output *= non_pad_mask

        output = self.pos_ffn(output)
        output *= non_pad_mask

        return output, slf_attn
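The two `output *= non_pad_mask` lines keep padded time steps at exactly zero after each sub-layer. A standalone NumPy sketch of the same masking (toy shapes):

```python
import numpy as np

# Toy sketch of how non_pad_mask is applied in FFTBlock.forward:
# positions beyond each sequence's true length are zeroed after both
# the self-attention and the position-wise FFN.
B, T, C = 2, 5, 3
lengths = [5, 3]
non_pad_mask = np.array(
    [[1.0] * l + [0.0] * (T - l) for l in lengths])[:, :, None]  # (B, T, 1)
output = np.random.rand(B, T, C)
output *= non_pad_mask  # padded time steps become exactly zero
print(output[1, 3:])    # all zeros for the padded tail of sequence 2
```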
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import math
import parakeet.models.fastspeech.utils
@@ -6,47 +19,50 @@
import paddle.fluid.layers as layers
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D


class LengthRegulator(dg.Layer):
    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(LengthRegulator, self).__init__()
        self.duration_predictor = DurationPredictor(
            input_size=input_size,
            out_channels=out_channels,
            filter_size=filter_size,
            dropout=dropout)

    def LR(self, x, duration_predictor_output, alpha=1.0):
        output = []
        batch_size = x.shape[0]
        for i in range(batch_size):
            output.append(
                self.expand(x[i:i + 1], duration_predictor_output[i:i + 1],
                            alpha))
        output = self.pad(output)
        return output

    def pad(self, input_ele):
        max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
        out_list = []
        for i in range(len(input_ele)):
            pad_len = max_len - input_ele[i].shape[0]
            one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
                                          pad_value=0.0)
            out_list.append(one_batch_padded)
        out_padded = layers.stack(out_list)
        return out_padded

    def expand(self, batch, predicted, alpha):
        out = []
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
        batch = layers.squeeze(batch, [0])

        for i in range(time_steps):
            if fertilities[0, i] == 0:
                continue
            out.append(
                layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1]))
        out = layers.concat(out, axis=0)
        return out

    def forward(self, x, alpha=1.0, target=None):
        """
@@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
        else:
            duration_predictor_output = layers.round(duration_predictor_output)
            output = self.LR(x, duration_predictor_output, alpha)
            mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
            mel_pos = layers.unsqueeze(mel_pos, [0])
            return output, mel_pos


class DurationPredictor(dg.Layer):
    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(DurationPredictor, self).__init__()
@@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
        self.dropout = dropout
        k = math.sqrt(1 / self.input_size)
        self.conv1 = Conv1D(
            num_channels=self.input_size,
            num_filters=self.out_channels,
            filter_size=self.filter_size,
            padding=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        #data_format='NTC')
        k = math.sqrt(1 / self.out_channels)
        self.conv2 = Conv1D(
            num_channels=self.out_channels,
            num_filters=self.out_channels,
            filter_size=self.filter_size,
            padding=1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        #data_format='NTC')
        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)

        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        k = math.sqrt(1 / self.out_channels)
        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
            low=-k, high=k))

        self.linear = dg.Linear(
            self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)

    def forward(self, encoder_output):
        """
@@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
            out (Variable), Shape(B, T, C), the output of duration predictor.
        """
        # encoder_output.shape(N, T, C)
        out = layers.transpose(encoder_output, [0, 2, 1])
        out = self.conv1(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
        out = layers.transpose(out, [0, 2, 1])
        out = self.conv2(out)
        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
        out = layers.relu(self.linear(out))
        out = layers.squeeze(out, axes=[-1])

        return out
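`expand` repeats each encoder step by its (rounded) predicted duration and skips zero fertilities. This NumPy sketch reproduces the logic with made-up numbers:

```python
import numpy as np

# NumPy sketch of LengthRegulator.expand: each encoder step i is
# repeated fertilities[i] times (zeros are skipped), which is what
# layers.expand + layers.concat implement above.
batch = np.array([[0.1, 0.2],     # encoder step 0, C = 2
                  [0.3, 0.4],     # encoder step 1
                  [0.5, 0.6]])    # encoder step 2
fertilities = np.array([2, 0, 3])
out = np.concatenate(
    [np.tile(batch[i:i + 1], (f, 1))
     for i, f in enumerate(fertilities) if f > 0],
    axis=0)
print(out.shape)  # (5, 2): 2 + 0 + 3 expanded frames
```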
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np


def get_alignment(attn_probs, mel_lens, n_head):
    max_F = 0
    assert attn_probs[0].shape[0] % n_head == 0
@@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head):
    for i in range(len(attn_probs)):
        multi_attn = attn_probs[i].numpy()
        for j in range(n_head):
            attn = multi_attn[j * batch_size:(j + 1) * batch_size]
            F = score_F(attn)
            if max_F < F:
                max_F = F
                max_attn = attn
    alignment = compute_duration(max_attn, mel_lens)
    return alignment


def score_F(attn):
    max = np.max(attn, axis=-1)
    mean = np.mean(max)
    return mean


def compute_duration(attn, mel_lens):
    alignment = np.zeros([attn.shape[0], attn.shape[2]])
    mel_lens = mel_lens.numpy()
    for i in range(attn.shape[0]):
        for j in range(mel_lens[i]):
            max_index = np.argmax(attn[i, j])
            alignment[i, max_index] += 1
    return alignment
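A toy run of `compute_duration`: for every decoder frame j, the most attended encoder position gets one count, so `alignment[i, k]` ends up as the number of mel frames assigned to phoneme k, i.e. a duration target.

```python
import numpy as np

# Toy attention matrix (B=1, mel frames=3, encoder positions=2).
attn = np.array([[[0.9, 0.1],     # frame 0 -> encoder pos 0
                  [0.8, 0.2],     # frame 1 -> encoder pos 0
                  [0.3, 0.7]]])   # frame 2 -> encoder pos 1
mel_lens = np.array([3])
alignment = np.zeros([attn.shape[0], attn.shape[2]])
for i in range(attn.shape[0]):
    for j in range(mel_lens[i]):
        alignment[i, np.argmax(attn[i, j])] += 1
print(alignment)  # [[2. 1.]]: phoneme 0 lasts 2 frames, phoneme 1 lasts 1
```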
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
@@ -7,9 +20,16 @@
from parakeet.modules.customized import Pool1D, Conv1D
from parakeet.modules.dynamic_gru import DynamicGRU
import numpy as np


class CBHG(dg.Layer):
    def __init__(self,
                 hidden_size,
                 batch_size,
                 K=16,
                 projection_size=256,
                 num_gru_layers=2,
                 max_pool_kernel_size=2,
                 is_post=False):
        super(CBHG, self).__init__()
        """
        :param hidden_size: dimension of hidden unit
@@ -24,28 +44,39 @@ class CBHG(dg.Layer):
        self.projection_size = projection_size
        self.conv_list = []
        k = math.sqrt(1 / projection_size)
        self.conv_list.append(
            Conv1D(
                num_channels=projection_size,
                num_filters=hidden_size,
                filter_size=1,
                padding=int(np.floor(1 / 2)),
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k))))
        k = math.sqrt(1 / hidden_size)
        for i in range(2, K + 1):
            self.conv_list.append(
                Conv1D(
                    num_channels=hidden_size,
                    num_filters=hidden_size,
                    filter_size=i,
                    padding=int(np.floor(i / 2)),
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k))))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batchnorm_list = []
        for i in range(K):
            self.batchnorm_list.append(
                dg.BatchNorm(
                    hidden_size, data_layout='NCHW'))

        for i, layer in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(i), layer)
@@ -53,91 +84,120 @@ class CBHG(dg.Layer):
        conv_outdim = hidden_size * K

        k = math.sqrt(1 / conv_outdim)
        self.conv_projection_1 = Conv1D(
            num_channels=conv_outdim,
            num_filters=hidden_size,
            filter_size=3,
            padding=int(np.floor(3 / 2)),
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        k = math.sqrt(1 / hidden_size)
        self.conv_projection_2 = Conv1D(
            num_channels=hidden_size,
            num_filters=projection_size,
            filter_size=3,
            padding=int(np.floor(3 / 2)),
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
        self.batchnorm_proj_2 = dg.BatchNorm(
            projection_size, data_layout='NCHW')
        self.max_pool = Pool1D(
            pool_size=max_pool_kernel_size,
            pool_type='max',
            pool_stride=1,
            pool_padding=1,
            data_format="NCT")
        self.highway = Highwaynet(self.projection_size)

        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
        h_0 = dg.to_variable(h_0)
        k = math.sqrt(1 / hidden_size)
        self.fc_forward1 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.fc_reverse1 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.gru_forward1 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=False,
            origin_mode=True,
            h_0=h_0)
        self.gru_reverse1 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=True,
            origin_mode=True,
            h_0=h_0)

        self.fc_forward2 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.fc_reverse2 = dg.Linear(
            hidden_size,
            hidden_size // 2 * 3,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.gru_forward2 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=False,
            origin_mode=True,
            h_0=h_0)
        self.gru_reverse2 = DynamicGRU(
            size=self.hidden_size // 2,
            is_reverse=True,
            origin_mode=True,
            h_0=h_0)

    def _conv_fit_dim(self, x, filter_size=3):
        if filter_size % 2 == 0:
            return x[:, :, :-1]
        else:
            return x

    def forward(self, input_):
        # input_.shape = [N, C, T]

        conv_list = []
        conv_input = input_

        for i, (conv, batchnorm
                ) in enumerate(zip(self.conv_list, self.batchnorm_list)):
            conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
            conv_input = layers.relu(batchnorm(conv_input))
            conv_list.append(conv_input)

        conv_cat = layers.concat(conv_list, axis=1)
        conv_pool = self.max_pool(conv_cat)[:, :, :-1]

        conv_proj = layers.relu(
            self.batchnorm_proj_1(
                self._conv_fit_dim(self.conv_projection_1(conv_pool))))
        conv_proj = self.batchnorm_proj_2(
            self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_

        # conv_proj.shape = [N, C, T]
        highway = layers.transpose(conv_proj, [0, 2, 1])
        highway = self.highway(highway)

        # highway.shape = [N, T, C]
@@ -151,9 +211,10 @@ class CBHG(dg.Layer):
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
        out = layers.transpose(out, [0, 2, 1])
        return out


class Highwaynet(dg.Layer):
    def __init__(self, num_units, num_layers=4):
        super(Highwaynet, self).__init__()
@@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
        self.linears = []
        k = math.sqrt(1 / num_units)
        for i in range(num_layers):
            self.linears.append(
                dg.Linear(
                    num_units,
                    num_units,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k))))
            self.gates.append(
                dg.Linear(
                    num_units,
                    num_units,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k))))

        for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
            self.add_sublayer("linears_{}".format(i), linear)
            self.add_sublayer("gates_{}".format(i), gate)
@@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
            t_ = fluid.layers.sigmoid(gate(out))

            c = 1 - t_
            out = h * t_ + out * c

        return out
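The loop above is the highway recurrence out = H(x) * T(x) + x * (1 - T(x)). Here is one such step in plain NumPy, with random stand-in weights and H assumed to be the relu branch computed in the collapsed lines:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
num_units = 4
x = rng.standard_normal(num_units)
W_h = rng.standard_normal((num_units, num_units))
W_t = rng.standard_normal((num_units, num_units))

h = np.maximum(W_h @ x, 0.0)     # H(x): relu(linear), per the hidden lines
t = sigmoid(W_t @ x)             # T(x): the transform gate
out = h * t + x * (1.0 - t)      # carry gate c = 1 - T(x)
print(out.shape)                 # (4,)
```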
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
@@ -7,70 +20,110 @@
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.prenet import PreNet
from parakeet.models.transformer_tts.post_convnet import PostConvNet


class Decoder(dg.Layer):
    def __init__(self, num_hidden, config, num_head=4):
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        param = fluid.ParamAttr()
        self.alpha = self.create_parameter(
            shape=(1, ),
            attr=param,
            dtype='float32',
            default_initializer=fluid.initializer.ConstantInitializer(
                value=1.0))
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.decoder_prenet = PreNet(
            input_size=config['audio']['num_mels'],
            hidden_size=num_hidden * 2,
            output_size=num_hidden,
            dropout_rate=0.2)
        k = math.sqrt(1 / num_hidden)
        self.linear = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.selfattn_layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(3)
        ]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.attn_layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(3)
        ]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(
                num_hidden, num_hidden * num_head, filter_size=1)
            for _ in range(3)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
        self.mel_linear = dg.Linear(
            num_hidden,
            config['audio']['num_mels'] * config['audio']['outputs_per_step'],
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        self.stop_linear = dg.Linear(
            num_hidden,
            1,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

        self.postconvnet = PostConvNet(
            config['audio']['num_mels'],
            config['hidden_size'],
            filter_size=5,
            padding=4,
            num_conv=5,
            outputs_per_step=config['audio']['outputs_per_step'],
            use_cudnn=True)

    def forward(self, key, value, query, c_mask, positional):
        # get decoder mask with triangular matrix
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = get_non_pad_mask(positional)
            mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
                                         query)
            triu_tensor = dg.to_variable(
                get_triu_tensor(query.numpy(), query.numpy())).astype(
                    np.float32)
            mask = mask + triu_tensor
            mask = fluid.layers.cast(mask == 0, np.float32)

            # (batch_size, decoder_len, encoder_len)
            zero_mask = get_attn_key_pad_mask(
                layers.squeeze(c_mask, [-1]), query)
        else:
            mask = get_triu_tensor(query.numpy(),
                                   query.numpy()).astype(np.float32)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None

        # Decoder pre-network
        query = self.decoder_prenet(query)

        # Centered position
        query = self.linear(query)
@@ -84,10 +137,13 @@ class Decoder(dg.Layer):
        # Attention decoder-decoder, encoder-decoder
        selfattn_list = list()
        attn_list = list()

        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
                                       self.ffns):
            query, attn_dec = selfattn(
                query, query, query, mask=mask, query_mask=m_mask)
            query, attn_dot = attn(
                key, value, query, mask=zero_mask, query_mask=m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)
@@ -96,7 +152,7 @@ class Decoder(dg.Layer):
        # Post Mel Network
        out = self.postconvnet(mel_out)
        out = mel_out + out

        # Stop tokens
        stop_tokens = self.stop_linear(query)
        stop_tokens = layers.squeeze(stop_tokens, [-1])
...
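The decoder mask above combines a padding mask with an upper-triangular tensor from `get_triu_tensor`, then casts `mask == 0` into a keep-mask so each frame can only attend to itself and the past. A standalone NumPy sketch of the triangular part:

```python
import numpy as np

# Causal keep-mask: 1 above the diagonal marks future positions, and
# (triu == 0) turns that into "1 where attention is allowed".
T = 4
triu = np.triu(np.ones([T, T]), k=1)
keep = (triu == 0).astype(np.float32)
print(keep)
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
```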
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.utils import *
@@ -5,25 +18,41 @@
from parakeet.modules.multihead_attention import MultiheadAttention
from parakeet.modules.ffn import PositionwiseFeedForward
from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet


class Encoder(dg.Layer):
    def __init__(self, embedding_size, num_hidden, num_head=4):
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
            value=1.0))
        self.alpha = self.create_parameter(
            shape=(1, ), attr=param, dtype='float32')
        self.pos_inp = get_sinusoid_encoding_table(
            1024, self.num_hidden, padding_idx=0)
        self.pos_emb = dg.Embedding(
            size=[1024, num_hidden],
            padding_idx=0,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.NumpyArrayInitializer(
                    self.pos_inp),
                trainable=False))
        self.encoder_prenet = EncoderPrenet(
            embedding_size=embedding_size,
            num_hidden=num_hidden,
            use_cudnn=True)
        self.layers = [
            MultiheadAttention(num_hidden, num_hidden // num_head,
                               num_hidden // num_head) for _ in range(3)
        ]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
        self.ffns = [
            PositionwiseFeedForward(
                num_hidden,
                num_hidden * num_head,
                filter_size=1,
                use_cudnn=True) for _ in range(3)
        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
@@ -33,25 +62,23 @@ class Encoder(dg.Layer):
            mask = get_attn_key_pad_mask(positional, x)
        else:
            query_mask, mask = None, None

        # Encoder pre_network
        x = self.encoder_prenet(x)  #(N,T,C)

        # Get positional encoding
        positional = self.pos_emb(positional)

        x = positional * self.alpha + x  #(N, T, C)

        # Positional dropout
        x = layers.dropout(x, 0.1)

        # Self attention encoder
        attentions = list()
        for layer, ffn in zip(self.layers, self.ffns):
            x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)

        return x, query_mask, attentions
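The encoder scales its sinusoid embedding by the learnable scalar `alpha` (initialized to 1.0) before adding it to the prenet output. A toy NumPy sketch of that step:

```python
import numpy as np

# Sketch of the scaled positional encoding in Encoder.forward;
# arrays are random stand-ins for the real prenet output and the
# looked-up position embedding.
T, C = 5, 8
x = np.random.rand(1, T, C)           # prenet output, (N, T, C)
positional = np.random.rand(1, T, C)  # position embedding lookup
alpha = 1.0                           # trainable scalar in the model
x = positional * alpha + x            # layers.dropout(x, 0.1) follows
print(x.shape)                        # (1, 5, 8)
```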
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
from parakeet.g2p.text.symbols import symbols
import paddle.fluid.dygraph as dg
@@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.embedding = dg.Embedding(
            size=[len(symbols), embedding_size], padding_idx=None)
        self.conv_list = []
        k = math.sqrt(1 / embedding_size)
        self.conv_list.append(
            Conv1D(
                num_channels=embedding_size,
                num_filters=num_hidden,
                filter_size=5,
                padding=int(np.floor(5 / 2)),
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))
        k = math.sqrt(1 / num_hidden)
        for _ in range(2):
            self.conv_list.append(
                Conv1D(
                    num_channels=num_hidden,
                    num_filters=num_hidden,
                    filter_size=5,
                    padding=int(np.floor(5 / 2)),
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k)),
                    use_cudnn=use_cudnn))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            dg.BatchNorm(
                num_hidden, data_layout='NCHW') for _ in range(3)
        ]

        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

        k = math.sqrt(1 / num_hidden)
        self.projection = dg.Linear(
            num_hidden,
            num_hidden,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

    def forward(self, x):
        x = self.embedding(x)  #(batch_size, seq_len, embedding_size)
        x = layers.transpose(x, [0, 2, 1])
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
        x = layers.transpose(x, [0, 2, 1])  #(N,T,C)
        x = self.projection(x)
        return x
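A shape walk-through of `EncoderPrenet.forward` with toy sizes: text embeddings are moved to channels-first for the Conv1D stack, then moved back before the linear projection. The conv stack itself is stubbed out with a random array of the right shape.

```python
import numpy as np

B, T, E, H = 2, 7, 16, 32
x = np.random.rand(B, T, E)   # embedding output, (N, T, C)
x = x.transpose(0, 2, 1)      # (N, C, T) for the convolutions
x = np.random.rand(B, H, T)   # stand-in for conv + bn + relu + dropout x3
x = x.transpose(0, 2, 1)      # back to (N, T, C)
print(x.shape)                # (2, 7, 32), input to the projection
```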
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from parakeet.modules.customized import Conv1D


class PostConvNet(dg.Layer):
    def __init__(self,
                 n_mels=80,
                 num_hidden=512,
                 filter_size=5,
@@ -16,49 +30,66 @@ class PostConvNet(dg.Layer):
                 dropout=0.1,
                 batchnorm_last=False):
        super(PostConvNet, self).__init__()

        self.dropout = dropout
        self.num_conv = num_conv
        self.batchnorm_last = batchnorm_last
        self.conv_list = []
        k = math.sqrt(1 / (n_mels * outputs_per_step))
        self.conv_list.append(
            Conv1D(
                num_channels=n_mels * outputs_per_step,
                num_filters=num_hidden,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))

        k = math.sqrt(1 / num_hidden)
        for _ in range(1, num_conv - 1):
            self.conv_list.append(
                Conv1D(
                    num_channels=num_hidden,
                    num_filters=num_hidden,
                    filter_size=filter_size,
                    padding=padding,
                    param_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.XavierInitializer()),
                    bias_attr=fluid.ParamAttr(
                        initializer=fluid.initializer.Uniform(
                            low=-k, high=k)),
                    use_cudnn=use_cudnn))

        self.conv_list.append(
            Conv1D(
                num_channels=num_hidden,
                num_filters=n_mels * outputs_per_step,
                filter_size=filter_size,
                padding=padding,
                param_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.XavierInitializer()),
                bias_attr=fluid.ParamAttr(
                    initializer=fluid.initializer.Uniform(
                        low=-k, high=k)),
                use_cudnn=use_cudnn))

        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)

        self.batch_norm_list = [
            dg.BatchNorm(
                num_hidden, data_layout='NCHW') for _ in range(num_conv - 1)
        ]
        if self.batchnorm_last:
            self.batch_norm_list.append(
                dg.BatchNorm(
                    n_mels * outputs_per_step, data_layout='NCHW'))
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)

    def forward(self, input):
        """
@@ -69,18 +100,19 @@ class PostConvNet(dg.Layer):
        Returns:
            output (Variable), Shape(B, T, C), the result after postconvnet.
        """

        input = layers.transpose(input, [0, 2, 1])
        len = input.shape[-1]
        for i in range(self.num_conv - 1):
            batch_norm = self.batch_norm_list[i]
            conv = self.conv_list[i]

            input = layers.dropout(
                layers.tanh(batch_norm(conv(input)[:, :, :len])), self.dropout)
        conv = self.conv_list[self.num_conv - 1]
        input = conv(input)[:, :, :len]
        if self.batchnorm_last:
            batch_norm = self.batch_norm_list[self.num_conv - 1]
            input = layers.dropout(batch_norm(input), self.dropout)
        output = layers.transpose(input, [0, 2, 1])
        return output
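Why `PostConvNet` slices `conv(input)[:, :, :len]`: assuming symmetric padding p on both sides (as the Conv1D wrapper appears to use), a kernel of size k makes the output longer than the input whenever 2p > k - 1, and keeping only the first `len` steps trims the surplus, leaving a causal left context. Toy arithmetic with the decoder's settings:

```python
# Standard 1-D convolution output length with symmetric padding.
T, k, p = 10, 5, 4
out_len = T + 2 * p - (k - 1)
print(out_len)       # 14: four extra steps produced
print(out_len - T)   # 4 == k - 1, trimmed away by the [:, :, :len] slice
```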
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
import paddle.fluid.layers as layers


class PreNet(dg.Layer):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
@@ -17,13 +31,21 @@ class PreNet(dg.Layer):
        self.dropout_rate = dropout_rate

        k = math.sqrt(1 / input_size)
        self.linear1 = dg.Linear(
            input_size,
            hidden_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))
        k = math.sqrt(1 / hidden_size)
        self.linear2 = dg.Linear(
            hidden_size,
            output_size,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)))

    def forward(self, x):
        """
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.models.transformer_tts.encoder import Encoder
from parakeet.models.transformer_tts.decoder import Decoder


class TransformerTTS(dg.Layer):
    def __init__(self, config):
        super(TransformerTTS, self).__init__()
...@@ -11,16 +25,10 @@ class TransformerTTS(dg.Layer):
        self.config = config

    def forward(self, characters, mel_input, pos_text, pos_mel):
        key, c_mask, attns_enc = self.encoder(characters, pos_text)
        mel_output, postnet_output, attn_probs, stop_preds, attns_dec = self.decoder(
            key, key, mel_input, c_mask, pos_mel)
        return mel_output, postnet_output, attn_probs, stop_preds, attns_enc, attns_dec
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import librosa
import os, copy
...@@ -6,14 +19,15 @@ import paddle.fluid.layers as layers


def get_positional_table(d_pos_vec, n_position=1024):
    position_enc = np.array(
        [[pos / np.power(10000, 2 * i / d_pos_vec) for i in range(d_pos_vec)]
         if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])

    position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
    position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
    return position_enc
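# Added note: the table maps each of n_position positions to a d_pos_vec-dim
# sinusoid vector, e.g. get_positional_table(d_pos_vec=8, n_position=16) has
# shape (16, 8), with row 0 kept all-zero for the padding position.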
def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    ''' Sinusoid position encoding table '''
...@@ -23,7 +37,8 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    def get_posi_angle_vec(position):
        return [cal_angle(position, hid_j) for hid_j in range(d_hid)]

    sinusoid_table = np.array(
        [get_posi_angle_vec(pos_i) for pos_i in range(n_position)])

    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # dim 2i
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # dim 2i+1
...@@ -34,8 +49,10 @@ def get_sinusoid_encoding_table(n_position, d_hid, padding_idx=None):
    return sinusoid_table


def get_non_pad_mask(seq):
    return layers.unsqueeze((seq != 0).astype(np.float32), [-1])


def get_attn_key_pad_mask(seq_k, seq_q):
    ''' For masking out the padding part of key sequence. '''
...@@ -43,32 +60,37 @@ def get_attn_key_pad_mask(seq_k, seq_q):
    # Expand to fit the shape of key query attention matrix.
    len_q = seq_q.shape[1]
    padding_mask = (seq_k != 0).astype(np.float32)
    padding_mask = layers.expand(
        layers.unsqueeze(padding_mask, [1]), [1, len_q, 1])
    return padding_mask
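# Added shape note: (seq_k != 0) is [batch, len_k]; unsqueeze makes it
# [batch, 1, len_k]; expand tiles it to [batch, len_q, len_k], i.e. one row
# of key-padding flags per query position.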
def get_triu_tensor(seq_k, seq_q):
    ''' Make an upper-triangular (triu) tensor. '''
    len_k = seq_k.shape[1]
    len_q = seq_q.shape[1]
    batch_size = seq_k.shape[0]
    triu_tensor = np.triu(np.ones([len_k, len_q]), 1)
    triu_tensor = np.repeat(
        np.expand_dims(
            triu_tensor, axis=0), batch_size, axis=0)
    return triu_tensor
def guided_attention(N, T, g=0.2):
    '''Guided attention. Refer to page 3 of the paper.'''
    W = np.zeros((N, T), dtype=np.float32)
    for n_pos in range(W.shape[0]):
        for t_pos in range(W.shape[1]):
            W[n_pos, t_pos] = 1 - np.exp(-(t_pos / float(T) - n_pos / float(N))
                                         **2 / (2 * g * g))
    return W
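# Worked example (illustration, not in the source): with g = 0.2 the diagonal
# of a square W is exactly 0 (t_pos / T == n_pos / N there), while far
# off-diagonal entries approach 1, e.g. guided_attention(4, 4)[0, 3]
# == 1 - np.exp(-(0.75 ** 2) / 0.08) ≈ 0.9991, penalizing non-monotonic
# attention.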
def cross_entropy(input, label, position_weight=1.0, epsilon=1e-30):
    output = -1 * label * layers.log(input + epsilon) - (
        1 - label) * layers.log(1 - input + epsilon)
    output = output * (label * (position_weight - 1) + 1)
    return layers.reduce_sum(output, dim=[0, 1])
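# Numeric sanity check (added, illustrative): for a positive label with
# input = 0.9 and position_weight = 5.0, the element loss is
# -log(0.9) * 5 ≈ 0.527; a negative label with input = 0.1 contributes
# -log(0.9) ≈ 0.105, so positive (stop) positions are up-weighted 5x.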
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid as fluid
from parakeet.modules.customized import Conv1D
from parakeet.models.transformer_tts.utils import *
from parakeet.models.transformer_tts.cbhg import CBHG


class Vocoder(dg.Layer):
    """
    CBHG Network (mel -> linear)
    """

    def __init__(self, config, batch_size):
        super(Vocoder, self).__init__()
        self.pre_proj = Conv1D(
            num_channels=config['audio']['num_mels'],
            num_filters=config['hidden_size'],
            filter_size=1)
        self.cbhg = CBHG(config['hidden_size'], batch_size)
        self.post_proj = Conv1D(
            num_channels=config['hidden_size'],
            num_filters=(config['audio']['n_fft'] // 2) + 1,
            filter_size=1)
    def forward(self, mel):
        mel = layers.transpose(mel, [0, 2, 1])
        mel = self.pre_proj(mel)
        mel = self.cbhg(mel)
        mag_pred = self.post_proj(mel)
        mag_pred = layers.transpose(mag_pred, [0, 2, 1])
        return mag_pred
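# Usage sketch (added for illustration; the config dict shape follows
# ljspeech.yaml and is an assumption, not verbatim source):
#
#     config = {'hidden_size': 256, 'audio': {'num_mels': 80, 'n_fft': 1024}}
#     with dg.guard():
#         vocoder = Vocoder(config, batch_size=1)
#         mel = dg.to_variable(np.zeros((1, 100, 80), dtype=np.float32))
#         mag = vocoder(mel)  # [1, 100, 513]: one linear-magnitude frame per
#                             # mel frame, with n_fft // 2 + 1 = 513 bins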
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.models.waveflow.waveflow import WaveFlow
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import librosa
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
import paddle.fluid.dygraph as dg
...
...@@ -2,7 +2,7 @@
Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms.
The WaveNet model was originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499).
Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.
We implement the WaveNet model in Paddle fluid with dynamic graph, which is convenient for flexible network architectures.
...@@ -51,10 +51,10 @@ python -u train.py --config=${yaml} \
#### Save and Load checkpoints

Our model saves model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default.
Saved checkpoints have the format `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.

There are three ways to load a checkpoint and resume training (suppose you want to load a checkpoint from iteration 500000); see the sketch after this list:

1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, which is `step-500000`; no extension name `.pdparams` or `.pdopt` is needed.
2. Use `--iteration=500000`.
3. If you specify neither `--checkpoint` nor `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint`.
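A minimal sketch of what these three options resolve to, based on the `load_parameters` helper shown later in this dump (`model` and the run directory are illustrative assumptions, not verbatim source):

```python
import utils  # the WaveNet utils module shown below

checkpoint_dir = "./runs/wavenet/my_model/checkpoint"  # hypothetical run dir
model = ...  # placeholder for the dygraph WaveNet module being trained

# 1. Explicit path: pass the base name, without .pdparams/.pdopt extension.
utils.load_parameters(checkpoint_dir, rank=0, model=model,
                      file_path=checkpoint_dir + "/step-500000")
# 2. Explicit iteration: the helper derives the step-500000 base name itself.
utils.load_parameters(checkpoint_dir, rank=0, model=model, iteration=500000)
# 3. Neither given: the helper falls back to the latest recorded checkpoint.
utils.load_parameters(checkpoint_dir, rank=0, model=model)
```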
...@@ -91,7 +91,7 @@ python -u synthesis.py --config=${yaml} \
    --root=./data/LJSpeech-1.1 \
    --name=${ModelName} --use_gpu=true \
    --output=./syn_audios \
    --sample=${SAMPLE}
```

In this example, `--output` specifies where to save the synthesized audios, and `--sample` specifies which sample in the valid dataset (a split from the whole LJSpeech dataset, which by default contains the first 16 audio samples) to synthesize, based on the mel-spectrograms computed from the ground-truth sample audio; e.g., `--sample=0` means synthesizing the first audio in the valid dataset.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import random
import librosa
...@@ -18,8 +32,8 @@ class Dataset(ljspeech.LJSpeech):
        self.fft_window_shift = config.fft_window_shift
        # Calculate context frames.
        frames_per_second = config.sample_rate // self.fft_window_shift
        train_clip_frames = int(
            np.ceil(config.train_clip_second * frames_per_second))
        context_frames = config.context_size // self.fft_window_shift
        self.num_frames = train_clip_frames + context_frames
...@@ -32,7 +46,7 @@ class Dataset(ljspeech.LJSpeech):
        fft_window_shift = config.fft_window_shift
        fft_window_size = config.fft_window_size
        fft_size = config.fft_size

        audio, loaded_sr = librosa.load(wav_path, sr=None)
        assert loaded_sr == sr
...@@ -41,42 +55,46 @@ class Dataset(ljspeech.LJSpeech):
        fft_padding = (fft_size - fft_window_shift) // 2
        desired_length = frames * fft_window_shift + fft_padding * 2
        pad_amount = (desired_length - audio.size) // 2

        if audio.size % 2 == 0:
            audio = np.pad(audio, (pad_amount, pad_amount), mode='reflect')
        else:
            audio = np.pad(audio, (pad_amount, pad_amount + 1), mode='reflect')

        # Normalize audio.
        audio = audio / np.abs(audio).max() * 0.999

        # Compute mel-spectrogram.
        # Turn center to False to prevent internal padding.
        spectrogram = librosa.core.stft(
            audio,
            hop_length=fft_window_shift,
            win_length=fft_window_size,
            n_fft=fft_size,
            center=False)
        spectrogram_magnitude = np.abs(spectrogram)

        # Compute mel-spectrograms.
        mel_filter_bank = librosa.filters.mel(sr=sr,
                                              n_fft=fft_size,
                                              n_mels=config.mel_bands)
        mel_spectrogram = np.dot(mel_filter_bank, spectrogram_magnitude)
        mel_spectrogram = mel_spectrogram.T

        # Rescale mel_spectrogram.
        min_level, ref_level = 1e-5, 20
        mel_spectrogram = 20 * np.log10(np.maximum(min_level, mel_spectrogram))
        mel_spectrogram = mel_spectrogram - ref_level
        mel_spectrogram = np.clip((mel_spectrogram + 100) / 100, 0, 1)

        # Extract the center of audio that corresponds to mel spectrograms.
        audio = audio[fft_padding:-fft_padding]
        assert mel_spectrogram.shape[0] * fft_window_shift == audio.size

        return audio, mel_spectrogram
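# Rescale arithmetic, worked through (added note): a magnitude of 1.0 maps to
# 20 * log10(1.0) - 20 = -20 dB and then to (-20 + 100) / 100 = 0.8, while
# the floor 1e-5 maps to -100 - 20 = -120 dB and clips to 0.0, so mel values
# always land in [0, 1].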
class Subset(dataset.Dataset):
    def __init__(self, dataset, indices, valid):
        self.dataset = dataset
        self.indices = indices
...@@ -100,23 +118,23 @@ class Subset(dataset.Dataset):
        audio_start = frame_start * fft_window_shift
        audio_end = frame_end * fft_window_shift
        audio = audio[audio_start:audio_end]

        return audio, mel, audio_start
    def _batch_examples(self, batch):
        audios = [sample[0] for sample in batch]
        audio_starts = [sample[2] for sample in batch]

        # mels shape [num_frames, mel_bands]
        max_frames = max(sample[1].shape[0] for sample in batch)
        mels = [utils.pad_to_size(sample[1], max_frames) for sample in batch]

        audios = np.array(audios, dtype=np.float32)
        mels = np.array(mels, dtype=np.float32)
        audio_starts = np.array(audio_starts, dtype=np.int32)

        return audios, mels, audio_starts
    def __len__(self):
...@@ -138,17 +156,17 @@ class LJSpeech:
        # Train dataset.
        trainset = Subset(ds, train_indices, valid=False)
        sampler = DistributedSampler(len(trainset), nranks, rank)

        total_bs = config.batch_size
        assert total_bs % nranks == 0
        train_sampler = BatchSampler(
            sampler, total_bs // nranks, drop_last=True)

        trainloader = DataCargo(trainset, batch_sampler=train_sampler)
        trainreader = fluid.io.PyReader(capacity=50, return_list=True)
        trainreader.decorate_batch_generator(trainloader, place)
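        # Added note: iter(int, 1) calls int() forever (it returns 0, never
        # the sentinel 1), so the generator below re-opens the reader
        # endlessly, yielding batches for as many epochs as training needs.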
        self.trainloader = (data for _ in iter(int, 1)
                            for data in trainreader())
        # Valid dataset.
        validset = Subset(ds, valid_indices, valid=True)
...@@ -156,5 +174,5 @@ class LJSpeech:
        validloader = DataCargo(validset, batch_size=1, shuffle=False)
        validreader = fluid.io.PyReader(capacity=20, return_list=True)
        validreader.decorate_batch_generator(validloader, place)
        self.validloader = validreader
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" """
Utility module for restarting training when using SLURM. Utility module for restarting training when using SLURM.
""" """
...@@ -45,8 +58,8 @@ def parse_time(text): ...@@ -45,8 +58,8 @@ def parse_time(text):
try: try:
return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds) return parse_hours(hours) * 3600 + int(minutes) * 60 + int(seconds)
except ValueError as e: except ValueError as e:
raise ValueError("Error parsing time {}. Got error {}.".format( raise ValueError("Error parsing time {}. Got error {}.".format(text,
text, str(e))) str(e)))
def restart_command():
...@@ -76,8 +89,10 @@ def restart_command():
    gres, partition = info.get("Gres"), info.get("Partition")
    stderr, stdout = info.get("StdErr"), info.get("StdOut")
    job_name = info.get("JobName")
    command = [
        "sbatch", "--job-name={}".format(job_name),
        "--ntasks={}".format(num_tasks)
    ]

    if partition:
        command.extend(["--partition", partition])
...@@ -98,12 +113,13 @@ def restart_command():
    dist_setting = ['-m', 'paddle.distributed.launch']
    wrap_cmd = ["srun", python, '-u'] + dist_setting + sys.argv

    command.append("--wrap={}".format(" ".join(
        shlex.quote(arg) for arg in wrap_cmd)))
    time_limit_string = info["TimeLimit"]
    if time_limit_string.lower() == "unlimited":
        print(
            "UNLIMITED detected: restart OFF, infinite learning ON.",
            flush=True)
        return command, None
    time_limit = parse_time(time_limit_string)
    runtime = parse_time(info["RunTime"])
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from pprint import pprint
...@@ -12,25 +26,42 @@ from wavenet import WaveNet


def add_options_to_parser(parser):
    parser.add_argument(
        '--model',
        type=str,
        default='wavenet',
        help="general name of the model")
    parser.add_argument(
        '--name', type=str, help="specific name of the training model")
    parser.add_argument(
        '--root', type=str, help="root path of the LJSpeech dataset")

    parser.add_argument(
        '--use_gpu',
        type=bool,
        default=True,
        help="option to use gpu training")

    parser.add_argument(
        '--iteration',
        type=int,
        default=None,
        help=("which iteration of checkpoint to load, "
              "default to load the latest checkpoint"))
    parser.add_argument(
        '--checkpoint',
        type=str,
        default=None,
        help="path of the checkpoint to load")

    parser.add_argument(
        '--output',
        type=str,
        default="./syn_audios",
        help="path to write synthesized audio files")
    parser.add_argument(
        '--sample',
        type=int,
        help="which of the valid samples to synthesize audio")
...@@ -52,7 +83,7 @@ def synthesize(config):
    fluid.default_startup_program().random_seed = seed
    fluid.default_main_program().random_seed = seed
    print("Random Seed: ", seed)

    # Build model.
    model = WaveNet(config, checkpoint_dir)
    model.build(training=False)
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import subprocess
...@@ -18,24 +32,42 @@ MAXIMUM_SAVE_TIME = 10 * 60


def add_options_to_parser(parser):
    parser.add_argument(
        '--model',
        type=str,
        default='wavenet',
        help="general name of the model")
    parser.add_argument(
        '--name', type=str, help="specific name of the training model")
    parser.add_argument(
        '--root', type=str, help="root path of the LJSpeech dataset")

    parser.add_argument(
        '--parallel',
        type=bool,
        default=True,
        help="option to use data parallel training")
    parser.add_argument(
        '--use_gpu',
        type=bool,
        default=True,
        help="option to use gpu training")

    parser.add_argument(
        '--iteration',
        type=int,
        default=None,
        help=("which iteration of checkpoint to load, "
              "default to load the latest checkpoint"))
    parser.add_argument(
        '--checkpoint',
        type=str,
        default=None,
        help="path of the checkpoint to load")

    parser.add_argument(
        '--slurm',
        type=bool,
        default=False,
        help="whether you are using slurm to submit training jobs")
...@@ -104,8 +136,8 @@ def train(config):
            # Check whether reaching the time limit.
            if config.slurm:
                done = (death_time is not None and
                        death_time - time.time() < MAXIMUM_SAVE_TIME)

                if rank == 0 and done:
                    print("Saving progress before exiting.")
...@@ -127,8 +159,8 @@ def train(config):
if __name__ == "__main__":
    # Create parser.
    parser = jsonargparse.ArgumentParser(
        description="Train WaveNet model", formatter_class='default_argparse')
    add_options_to_parser(parser)
    utils.add_config_options_to_parser(parser)
...@@ -136,4 +168,4 @@ if __name__ == "__main__":
    # For conflicting updates to the same field,
    # the preceding update will be overwritten by the following one.
    config = parser.parse_args()
    train(config)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...@@ -8,57 +22,82 @@ import paddle.fluid.dygraph as dg


def add_config_options_to_parser(parser):
    parser.add_argument(
        '--valid_size', type=int, help="size of the valid dataset")
    parser.add_argument(
        '--train_clip_second',
        type=float,
        help="the length of audio clip for training")
    parser.add_argument(
        '--sample_rate', type=int, help="sampling rate of audio data file")
    parser.add_argument(
        '--fft_window_shift',
        type=int,
        help="the shift of fft window for each frame")
    parser.add_argument(
        '--fft_window_size',
        type=int,
        help="the size of fft window for each frame")
    parser.add_argument(
        '--fft_size', type=int, help="the size of fft filter on each frame")
    parser.add_argument(
        '--mel_bands',
        type=int,
        help="the number of mel bands when calculating mel spectrograms")

    parser.add_argument(
        '--seed', type=int, help="seed of random initialization for the model")
    parser.add_argument(
        '--batch_size', type=int, help="batch size for training")
    parser.add_argument(
        '--test_every', type=int, help="test interval during training")
    parser.add_argument(
        '--save_every',
        type=int,
        help="checkpointing interval during training")
    parser.add_argument(
        '--max_iterations', type=int, help="maximum training iterations")

    parser.add_argument(
        '--layers', type=int, help="number of dilated convolution layers")
    parser.add_argument(
        '--kernel_width', type=int, help="dilated convolution kernel width")
    parser.add_argument(
        '--dilation_block',
        type=list,
        help="the block of dilation factors cycled through to build the "
             "dilated convolution layers")
    parser.add_argument('--residual_channels', type=int)
    parser.add_argument('--skip_channels', type=int)
    parser.add_argument(
        '--loss_type', type=str, help="mix-gaussian-pdf or softmax")
    parser.add_argument(
        '--num_channels',
        type=int,
        default=None,
        help="number of channels for softmax output")
    parser.add_argument(
        '--num_mixtures',
        type=int,
        default=None,
        help="number of gaussian mixtures for gaussian output")
    parser.add_argument(
        '--log_scale_min',
        type=float,
        default=None,
        help="minimum clip value of log variance of gaussian output")

    parser.add_argument(
        '--conditioner.filter_sizes',
        type=list,
        help="conv2d transpose op filter sizes for building conditioner")
    parser.add_argument(
        '--conditioner.upsample_factors',
        type=list,
        help="list of upsample factors for building conditioner")

    parser.add_argument('--learning_rate', type=float)
    parser.add_argument('--gradient_max_norm', type=float)
    parser.add_argument(
        '--anneal.every',
        type=int,
        help="step interval for annealing learning rate")
    parser.add_argument('--anneal.rate', type=float)
...@@ -113,8 +152,12 @@ def save_latest_checkpoint(checkpoint_dir, iteration):
        handle.write("model_checkpoint_path: step-{}".format(iteration))
def load_parameters(checkpoint_dir,
                    rank,
                    model,
                    optimizer=None,
                    iteration=None,
                    file_path=None):
    if file_path is None:
        if iteration is None:
            iteration = load_latest_checkpoint(checkpoint_dir, rank)
...@@ -128,7 +171,7 @@ def load_parameters(checkpoint_dir, rank, model, optimizer=None,
    if optimizer and optimizer_dict:
        optimizer.set_dict(optimizer_dict)
        print("[checkpoint] Rank {}: loaded optimizer state from {}".format(
            rank, file_path))


def save_latest_parameters(checkpoint_dir, iteration, model, optimizer=None):
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import os
import time
...@@ -13,8 +27,13 @@ from wavenet_modules import WaveNetModule


class WaveNet():
    def __init__(self,
                 config,
                 checkpoint_dir,
                 parallel=False,
                 rank=0,
                 nranks=1,
                 tb_logger=None):
        # Process config to calculate the context size
        dilations = list(
            itertools.islice(
...@@ -29,12 +48,12 @@ class WaveNet():
    def build(self, training=True):
        config = self.config
        dataset = LJSpeech(config, self.nranks, self.rank)
        self.trainloader = dataset.trainloader
        self.validloader = dataset.validloader

        wavenet = WaveNetModule("wavenet", config, self.rank)

        # Dry run once to create and initialize all necessary parameters.
        audio = dg.to_variable(np.random.randn(1, 20000).astype(np.float32))
        mel = dg.to_variable(
...@@ -45,38 +64,44 @@ class WaveNet():
        if training:
            # Create Learning rate scheduler.
            lr_scheduler = dg.ExponentialDecay(
                learning_rate=config.learning_rate,
                decay_steps=config.anneal.every,
                decay_rate=config.anneal.rate,
                staircase=True)

            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=lr_scheduler)

            clipper = fluid.dygraph_grad_clip.GradClipByGlobalNorm(
                config.gradient_max_norm)

            # Load parameters.
            utils.load_parameters(
                self.checkpoint_dir,
                self.rank,
                wavenet,
                optimizer,
                iteration=config.iteration,
                file_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            # Data parallelism.
            if self.parallel:
                strategy = dg.parallel.prepare_context()
                wavenet = dg.parallel.DataParallel(wavenet, strategy)

            self.wavenet = wavenet
            self.optimizer = optimizer
            self.clipper = clipper
        else:
            # Load parameters.
            utils.load_parameters(
                self.checkpoint_dir,
                self.rank,
                wavenet,
                iteration=config.iteration,
                file_path=config.checkpoint)
            print("Rank {}: checkpoint loaded.".format(self.rank))

            self.wavenet = wavenet
...@@ -104,7 +129,9 @@ class WaveNet():
        else:
            current_lr = self.optimizer._learning_rate

        self.optimizer.minimize(
            loss,
            grad_clip=self.clipper,
            parameter_list=self.wavenet.parameters())
        self.wavenet.clear_gradients()
...@@ -143,10 +170,16 @@ class WaveNet():
        tb = self.tb_logger
        tb.add_scalar("Valid-Avg-Loss", loss_val, iteration)
        tb.add_audio(
            "Teacher-Forced-Audio-0",
            sample_audios[0].numpy(),
            iteration,
            sample_rate=self.config.sample_rate)
        tb.add_audio(
            "Teacher-Forced-Audio-1",
            sample_audios[1].numpy(),
            iteration,
            sample_rate=self.config.sample_rate)
    @dg.no_grad
    def infer(self, iteration):
...@@ -165,10 +198,9 @@ class WaveNet():
        start_time = time.time()
        syn_audio = self.wavenet.synthesize(mels_list[sample])
        syn_time = time.time() - start_time
        print("audio shape {}, synthesis time {}".format(syn_audio.shape,
                                                         syn_time))
        librosa.output.write_wav(filename, syn_audio, sr=config.sample_rate)

    def save(self, iteration):
        utils.save_latest_parameters(self.checkpoint_dir, iteration,
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
...@@ -16,11 +30,11 @@ def get_padding(filter_size, stride, padding_type='same'):
def extract_slices(x, audio_starts, audio_length, rank):
    slices = []
    for i in range(x.shape[0]):
        start = audio_starts.numpy()[i]
        end = start + audio_length
        slice = fluid.layers.slice(
            x, axes=[0, 1], starts=[i, start], ends=[i + 1, end])
        slices.append(fluid.layers.squeeze(slice, [0]))

    x = fluid.layers.stack(slices, axis=0)
...@@ -50,7 +64,7 @@ class Conditioner(dg.Layer):
        # Register python list as parameters.
        for i, layer in enumerate(self.deconvs):
            self.add_sublayer("conv_transpose_{}".format(i), layer)

    def forward(self, x):
        x = fluid.layers.unsqueeze(x, 1)
        for layer in self.deconvs:
...@@ -62,7 +76,7 @@ class Conditioner(dg.Layer):
class WaveNetModule(dg.Layer):
    def __init__(self, name_scope, config, rank):
        super(WaveNetModule, self).__init__(name_scope)

        self.rank = rank
        self.conditioner = Conditioner(self.full_name(), config)
        self.dilations = list(
...@@ -82,15 +96,13 @@ class WaveNetModule(dg.Layer):
                embed_dim=config.residual_channels,
                std=0.1)
        elif config.loss_type == "mix-gaussian-pdf":
            self.embedding_fc = modules.FC(self.full_name(),
                                           in_features=1,
                                           size=config.residual_channels,
                                           num_flatten_dims=2,
                                           relu=False)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(config.loss_type))
        self.dilated_causal_convs = []
        for dilation in self.dilations:
...@@ -102,56 +114,49 @@ class WaveNetModule(dg.Layer):
                    num_filters=config.residual_channels,
                    filter_size=config.kernel_width,
                    dilation=dilation,
                    causal=True))

        for i, layer in enumerate(self.dilated_causal_convs):
            self.add_sublayer("dilated_causal_conv_{}".format(i), layer)
        self.fc1 = modules.FC(self.full_name(),
                              in_features=config.residual_channels,
                              size=config.skip_channels,
                              num_flatten_dims=2,
                              relu=True,
                              act="relu")

        self.fc2 = modules.FC(self.full_name(),
                              in_features=config.skip_channels,
                              size=config.skip_channels,
                              num_flatten_dims=2,
                              relu=True,
                              act="relu")

        if config.loss_type == "softmax":
            self.fc3 = modules.FC(self.full_name(),
                                  in_features=config.skip_channels,
                                  size=config.num_channels,
                                  num_flatten_dims=2,
                                  relu=False)
        elif config.loss_type == "mix-gaussian-pdf":
            self.fc3 = modules.FC(self.full_name(),
                                  in_features=config.skip_channels,
                                  size=3 * config.num_mixtures,
                                  num_flatten_dims=2,
                                  relu=False)
        else:
            raise ValueError(
                "loss_type {} is unsupported!".format(config.loss_type))
    def sample_softmax(self, mix_parameters):
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
                                            [batch * length, hidden])
        mix_param_2d = fluid.layers.softmax(mix_param_2d, axis=-1)

        # quantized: [batch * length]
        quantized = fluid.layers.cast(
            fluid.layers.sampling_id(mix_param_2d), dtype="float32")
        samples = (quantized + 0.5) * (2.0 / self.config.num_channels) - 1.0

        # samples: [batch * length]
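        # Dequantization, worked through (added note): with num_channels=256,
        # bin 0 maps to 0.5 * (2 / 256) - 1 ≈ -0.996 and bin 255 maps to
        # 255.5 * (2 / 256) - 1 ≈ 0.996, i.e. evenly spaced bin centers
        # inside (-1, 1).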
...@@ -162,23 +167,23 @@ class WaveNetModule(dg.Layer):
        # to [bs * len, 3 * num_mixtures].
        batch, length, hidden = mix_parameters.shape
        mix_param_2d = fluid.layers.reshape(mix_parameters,
                                            [batch * length, hidden])
        K = hidden // 3

        # Unpack the parameters of the mixture of gaussian.
        logits_pi = mix_param_2d[:, 0:K]
        mu = mix_param_2d[:, K:2 * K]
        log_s = mix_param_2d[:, 2 * K:3 * K]
        s = fluid.layers.exp(log_s)

        pi = fluid.layers.softmax(logits_pi, axis=-1)
        comp_samples = fluid.layers.sampling_id(pi)

        row_idx = dg.to_variable(np.arange(batch * length))
        comp_samples = fluid.layers.stack([row_idx, comp_samples], axis=-1)

        mu_comp = fluid.layers.gather_nd(mu, comp_samples)
        s_comp = fluid.layers.gather_nd(s, comp_samples)

        # N(0, 1) normal sample.
        u = fluid.layers.gaussian_random(shape=[batch * length])
...@@ -220,8 +225,9 @@ class WaveNetModule(dg.Layer):
        # Calculate gaussian loss.
        targets = fluid.layers.unsqueeze(targets, -1)
        targets = fluid.layers.expand(targets,
                                      [1, 1, self.config.num_mixtures])
        x_std = inv_s * (targets - mu)
        exponent = fluid.layers.exp(-0.5 * x_std * x_std)
        pdf_x = 1.0 / np.sqrt(2.0 * np.pi) * inv_s * exponent
        pdf_x = pi * pdf_x
...@@ -239,9 +245,9 @@ class WaveNetModule(dg.Layer):
        # Slice conditioners.
        audio_length = audios.shape[1]
        conditioner = extract_slices(full_conditioner, audio_starts,
                                     audio_length, self.rank)

        # input_audio, target_audio: [bs, len]
        input_audios = audios[:, :-1]
        target_audios = audios[:, 1:]
...@@ -263,15 +269,16 @@ class WaveNetModule(dg.Layer):
            layer_input = self.embedding_fc(
                fluid.layers.unsqueeze(input_audios, 2))
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        # layer_input: [bs, res_channel, 1, len]
        layer_input = fluid.layers.unsqueeze(
            fluid.layers.transpose(
                layer_input, perm=[0, 2, 1]), 2)
        # conditioner: [bs, mel_bands, 1, len]
        conditioner = fluid.layers.unsqueeze(
            fluid.layers.transpose(
                conditioner, perm=[0, 2, 1]), 2)

        skip = None
        for i, layer in enumerate(self.dilated_causal_convs):
@@ -292,23 +299,22 @@ class WaveNetModule(dg.Layer):
        elif loss_type == "mix-gaussian-pdf":
            sample_audios = self.sample_mix_gaussian(mix_parameters)
        else:
            raise ValueError("loss_type {} is unsupported!".format(
                loss_type))

        if loss_type == "softmax":
            loss = self.softmax_loss(target_audios, mix_parameters)
        elif loss_type == "mix-gaussian-pdf":
            loss = self.mixture_density_loss(target_audios, mix_parameters,
                                             self.log_scale_min)
        else:
            raise ValueError("loss_type {} is unsupported!".format(loss_type))

        return loss, sample_audios
    def synthesize(self, mels):
        self.start_new_sequence()

        bs, n_frames, mel_bands = mels.shape
        conditioner = self.conditioner(mels)
        time_steps = conditioner.shape[1]
@@ -335,23 +341,24 @@ class WaveNetModule(dg.Layer):
            elif loss_type == "mix-gaussian-pdf":
                audio_input = self.embedding_fc(current_sample)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))

            # [bs, channel, 1, 1]
            audio_input = fluid.layers.unsqueeze(
                fluid.layers.transpose(
                    audio_input, perm=[0, 2, 1]), 2)
            # [bs, mel_bands]
            cond_input = conditioner[:, i, :]
            # [bs, mel_bands, 1, 1]
            cond_input = fluid.layers.reshape(cond_input,
                                              cond_input.shape + [1, 1])

            skip = None
            for layer in self.dilated_causal_convs:
                audio_input, skip = layer.add_input(audio_input, skip,
                                                    cond_input)

            # [bs, 1, channel]
            skip = fluid.layers.transpose(
                fluid.layers.squeeze(skip, [2]), perm=[0, 2, 1])
@@ -361,19 +368,19 @@ class WaveNetModule(dg.Layer):
            elif loss_type == "mix-gaussian-pdf":
                sample = self.sample_mix_gaussian(mix_parameters)
            else:
                raise ValueError("loss_type {} is unsupported!".format(
                    loss_type))

            audio_samples.append(sample)
            # [bs]
            current_sample = audio_samples[-1]
            # [bs, 1, 1]
            current_sample = fluid.layers.reshape(
                current_sample, current_sample.shape + [1, 1])

        # syn_audio: [num_samples]
        syn_audio = fluid.layers.concat(audio_samples, axis=0).numpy()
        return syn_audio
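Structurally, synthesize is a per-sample autoregressive loop: condition on frame t, run one forward step, sample, and feed the sample back. A minimal schematic with hypothetical step/sample helpers (not the actual API of this module):

```python
import numpy as np

def synthesize_sketch(step_fn, sample_fn, conditioner, time_steps, batch_size):
    # step_fn and sample_fn are hypothetical stand-ins for one forward
    # step of the network and for drawing from its output distribution.
    audio_samples = []
    current = np.zeros(batch_size, dtype="float32")   # start from silence
    for t in range(time_steps):
        params = step_fn(current, conditioner[:, t])  # one-step forward
        current = sample_fn(params)                   # next sample, fed back
        audio_samples.append(current)
    return np.stack(audio_samples, axis=-1)           # [batch, time_steps]
```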
    def start_new_sequence(self):
        for layer in self.sublayers():
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from . import weight_norm
from .customized import *
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import fluid
import paddle.fluid.layers as F
import paddle.fluid.dygraph as dg
@@ -7,14 +21,15 @@ class Pool1D(dg.Layer):
""" """
A Pool 1D block implemented with Pool2D. A Pool 1D block implemented with Pool2D.
""" """
def __init__(self, def __init__(self,
pool_size=-1, pool_size=-1,
pool_type='max', pool_type='max',
pool_stride=1, pool_stride=1,
pool_padding=0, pool_padding=0,
global_pooling=False, global_pooling=False,
use_cudnn=True, use_cudnn=True,
ceil_mode=False, ceil_mode=False,
exclusive=True, exclusive=True,
data_format='NCT'): data_format='NCT'):
super(Pool1D, self).__init__() super(Pool1D, self).__init__()
@@ -28,13 +43,16 @@ class Pool1D(dg.Layer):
        self.exclusive = exclusive
        self.data_format = data_format

        self.pool2d = dg.Pool2D(
            [1, pool_size],
            pool_type=pool_type,
            pool_stride=[1, pool_stride],
            pool_padding=[0, pool_padding],
            global_pooling=global_pooling,
            use_cudnn=use_cudnn,
            ceil_mode=ceil_mode,
            exclusive=exclusive)
    def forward(self, x):
        """
        Args:
@@ -53,12 +71,14 @@ class Pool1D(dg.Layer):
        x = fluid.layers.transpose(x, [0, 2, 1])
        return x
class Conv1D(dg.Conv2D):
    """A standard Conv1D layer that uses the (B, C, T) data layout. It inherits
    from Conv2D and uses the (B, C, 1, T) layout internally to compute the 1D
    convolution. Nothing more.
    NOTE: we inherit from Conv2D instead of wrapping a Conv2D layer so that this
    stays a simple layer and weight norm can easily be applied to it.
    """
    def __init__(self,
                 num_channels,
                 num_filters,
@@ -72,17 +92,18 @@ class Conv1D(dg.Conv2D):
                 use_cudnn=True,
                 act=None,
                 dtype='float32'):
        super(Conv1D, self).__init__(
            num_channels,
            num_filters, (1, filter_size),
            stride=(1, stride),
            padding=(0, padding),
            dilation=(1, dilation),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        x = F.unsqueeze(x, [2])
@@ -105,18 +126,19 @@ class Conv1DTranspose(dg.Conv2DTranspose):
                 use_cudnn=True,
                 act=None,
                 dtype='float32'):
        super(Conv1DTranspose, self).__init__(
            num_channels,
            num_filters, (1, filter_size),
            output_size=None,
            padding=(0, padding),
            stride=(1, stride),
            dilation=(1, dilation),
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        x = F.unsqueeze(x, [2])
@@ -134,6 +156,7 @@ class Conv1DCell(Conv1D):
    It is a cell that acts like an RNN cell. It does not support stride > 1, and it
    ensures a 1-to-1 mapping from input time steps to output time steps.
    """
    def __init__(self,
                 num_channels,
                 num_filters,
@@ -150,18 +173,19 @@ class Conv1DCell(Conv1D):
        padding = receptive_field - 1 if causal else receptive_field // 2
        self._receptive_field = receptive_field
        self.causal = causal
        super(Conv1DCell, self).__init__(
            num_channels,
            num_filters,
            filter_size,
            stride=1,
            padding=padding,
            dilation=dilation,
            groups=groups,
            param_attr=param_attr,
            bias_attr=bias_attr,
            use_cudnn=use_cudnn,
            act=act,
            dtype=dtype)
    def forward(self, x):
        # ensures that output time steps == input time steps
@@ -189,15 +213,16 @@ class Conv1DCell(Conv1D):
    def add_input(self, x_t):
        batch_size, c_in, _ = x_t.shape
        if self._buffer is None:
            self._buffer = F.zeros(
                (batch_size, c_in, self.receptive_field), dtype=x_t.dtype)
        self._buffer = F.concat([self._buffer[:, :, 1:], x_t], -1)
        if self._dilation[1] > 1:
            input = F.strided_slice(
                self._buffer,
                axes=[2],
                starts=[0],
                ends=[self.receptive_field],
                strides=[self._dilation[1]])
        else:
            input = self._buffer
        input = F.reshape(input, (batch_size, -1))
...
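A usage sketch for the wrappers above; the (B, C, T) layout and the shapes are assumptions consistent with the docstrings:

```python
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    conv = Conv1D(num_channels=8, num_filters=16, filter_size=3, padding=1)
    x = dg.to_variable(np.random.randn(2, 8, 100).astype("float32"))
    y = conv(x)  # [2, 16, 100]: padding=1 keeps the time dimension

    # Conv1DCell additionally supports incremental decoding: add_input()
    # consumes one (B, C, 1) frame at a time while the cell maintains a
    # buffer of the last `receptive_field` inputs, as shown in add_input.
```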
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers


class DynamicGRU(dg.Layer):
    def __init__(self,
                 size,
@@ -49,4 +63,3 @@ class DynamicGRU(dg.Layer):
        res = res[::-1]
        res = layers.concat(res, axis=1)
        return res
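The tail of DynamicGRU above handles the backward direction: per-step outputs collected in reversed order are flipped back and concatenated along time. In NumPy terms, a sketch of the bookkeeping only:

```python
import numpy as np

res = [np.full((2, 1, 8), t) for t in range(5)]  # one [B, 1, C] slice per step
res = res[::-1]                                  # undo the reversed iteration
out = np.concatenate(res, axis=1)                # [B, T, C], time order restored
```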
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers
import paddle.fluid as fluid
@@ -7,28 +20,41 @@ from parakeet.modules.customized import Conv1D
class PositionwiseFeedForward(dg.Layer):
    ''' A two-feed-forward-layer module '''

    def __init__(self,
                 d_in,
                 num_hidden,
                 filter_size,
                 padding=0,
                 use_cudnn=True,
                 dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
        self.dropout = dropout

        k = math.sqrt(1 / d_in)
        self.w_1 = Conv1D(
            num_channels=d_in,
            num_filters=num_hidden,
            filter_size=filter_size,
            padding=padding,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)),
            use_cudnn=use_cudnn)
        k = math.sqrt(1 / num_hidden)
        self.w_2 = Conv1D(
            num_channels=num_hidden,
            num_filters=d_in,
            filter_size=filter_size,
            padding=padding,
            param_attr=fluid.ParamAttr(
                initializer=fluid.initializer.XavierInitializer()),
            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k)),
            use_cudnn=use_cudnn)
        self.layer_norm = dg.LayerNorm(d_in)

    def forward(self, input):
@@ -40,18 +66,18 @@ class PositionwiseFeedForward(dg.Layer):
        Returns:
            output (Variable), Shape(B, T, C), the result after FFN.
        """
        x = layers.transpose(input, [0, 2, 1])

        # FFN network
        x = self.w_2(layers.relu(self.w_1(x)))

        # dropout
        x = layers.dropout(x, self.dropout)

        x = layers.transpose(x, [0, 2, 1])
        # residual connection
        x = x + input

        # layer normalization
        output = self.layer_norm(x)
        return output
\ No newline at end of file
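A usage sketch: the block maps (B, T, C) to (B, T, C), so it can sit between attention layers; the sizes here are hypothetical:

```python
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    ffn = PositionwiseFeedForward(d_in=256, num_hidden=1024, filter_size=1)
    x = dg.to_variable(np.random.randn(2, 50, 256).astype("float32"))
    y = ffn(x)  # same shape as x: conv -> relu -> conv, residual, layer norm
```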
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import numpy as np

import paddle.fluid as fluid
import paddle.fluid.dygraph as dg
import paddle.fluid.layers as layers


class Linear(dg.Layer):
    def __init__(self,
                 in_features,
                 out_features,
                 is_bias=True,
                 dtype="float32"):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.dtype = dtype
        self.weight = fluid.ParamAttr(
            initializer=fluid.initializer.XavierInitializer())
        self.bias = is_bias

        if is_bias is not False:
            k = math.sqrt(1 / in_features)
            self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
                low=-k, high=k))

        self.linear = dg.Linear(
            in_features,
            out_features,
            param_attr=self.weight,
            bias_attr=self.bias, )

    def forward(self, x):
        x = self.linear(x)
        return x
class ScaledDotProductAttention(dg.Layer):
    def __init__(self, d_key):
        super(ScaledDotProductAttention, self).__init__()

        self.d_key = d_key

    # note: this mask differs from the one in the PyTorch implementation
    def forward(self,
                key,
                value,
                query,
                mask=None,
                query_mask=None,
                dropout=0.1):
""" """
Scaled Dot Product Attention. Scaled Dot Product Attention.
@@ -47,27 +77,36 @@ class ScaledDotProductAttention(dg.Layer):
            attention (Variable), Shape(n_head * B, T, C), the attention of key.
        """
        # Compute attention score
        attention = layers.matmul(
            query, key, transpose_y=True)  # transpose the last dim in y
        attention = attention / math.sqrt(self.d_key)

        # Mask key to ignore padding
        if mask is not None:
            attention = attention * mask
            mask = (mask == 0).astype(np.float32) * (-2**32 + 1)
            attention = attention + mask

        attention = layers.softmax(attention)
        attention = layers.dropout(attention, dropout)

        # Mask query to ignore padding
        if query_mask is not None:
            attention = attention * query_mask

        result = layers.matmul(attention, value)
        return result, attention
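The additive masking above relies on softmax sending very negative scores to (near) zero weight. The trick in isolation, as a NumPy sketch:

```python
import numpy as np

scores = np.array([[2.0, 1.0, 0.5]])
mask = np.array([[1.0, 1.0, 0.0]])  # last key position is padding
masked = scores * mask + (mask == 0).astype(np.float32) * (-2**32 + 1)
weights = np.exp(masked) / np.exp(masked).sum(axis=-1, keepdims=True)
# weights[0, 2] underflows to 0.0: the padded key gets no attention
```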
class MultiheadAttention(dg.Layer):
    def __init__(self,
                 num_hidden,
                 d_k,
                 d_q,
                 num_head=4,
                 is_bias=False,
                 dropout=0.1,
                 is_concat=True):
        super(MultiheadAttention, self).__init__()
        self.num_hidden = num_hidden
        self.num_head = num_head
@@ -109,30 +148,44 @@ class MultiheadAttention(dg.Layer):
        # repeat masks h times
        if query_mask is not None:
            query_mask = layers.expand(query_mask,
                                       [self.num_head, 1, seq_len_key])
        if mask is not None:
            mask = layers.expand(mask, (self.num_head, 1, 1))

        # Make multihead attention
        # key & value.shape = (batch_size, seq_len, feature)
        # (feature = num_head * num_hidden_per_attn)
        key = layers.reshape(
            self.key(key), [batch_size, seq_len_key, self.num_head, self.d_k])
        value = layers.reshape(
            self.value(value),
            [batch_size, seq_len_key, self.num_head, self.d_k])
        query = layers.reshape(
            self.query(query_input),
            [batch_size, seq_len_query, self.num_head, self.d_q])

        key = layers.reshape(
            layers.transpose(key, [2, 0, 1, 3]), [-1, seq_len_key, self.d_k])
        value = layers.reshape(
            layers.transpose(value, [2, 0, 1, 3]),
            [-1, seq_len_key, self.d_k])
        query = layers.reshape(
            layers.transpose(query, [2, 0, 1, 3]),
            [-1, seq_len_query, self.d_q])

        result, attention = self.scal_attn(
            key, value, query, mask=mask, query_mask=query_mask)

        # concat all multihead result
        result = layers.reshape(
            result, [self.num_head, batch_size, seq_len_query, self.d_q])
        result = layers.reshape(
            layers.transpose(result, [1, 2, 0, 3]),
            [batch_size, seq_len_query, -1])
        if self.is_concat:
            result = layers.concat([query_input, result], axis=-1)
        result = layers.dropout(self.fc(result), self.dropout)
        result = result + query_input

        result = self.layer_norm(result)
        return result, attention
\ No newline at end of file
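A self-attention usage sketch; the (key, value, query) argument order follows the scaled-dot module above, and num_hidden = num_head * d_k is assumed so the reshapes line up (both are assumptions, since the forward signature is elided in this diff):

```python
import numpy as np
import paddle.fluid.dygraph as dg

with dg.guard():
    attn = MultiheadAttention(num_hidden=256, d_k=64, d_q=64, num_head=4)
    x = dg.to_variable(np.random.randn(2, 50, 256).astype("float32"))
    result, attention = attn(x, x, x)  # self-attention: key = value = query
```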
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle import fluid
import paddle.fluid.dygraph as dg
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from torch import nn
import paddle.fluid.dygraph as dg
@@ -10,8 +24,8 @@ def summary(layer):
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape))) print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
num_elements += np.prod(param.shape) num_elements += np.prod(param.shape)
num_params += 1 num_params += 1
print("layer has {} parameters, {} elements.".format( print("layer has {} parameters, {} elements.".format(num_params,
num_params, num_elements)) num_elements))
def freeze(layer): def freeze(layer):
@@ -31,5 +45,5 @@ def torch_summary(layer):
print("{}|{}|{}".format(name, param.shape, np.prod(param.shape))) print("{}|{}|{}".format(name, param.shape, np.prod(param.shape)))
num_elements += np.prod(param.shape) num_elements += np.prod(param.shape)
num_params += 1 num_params += 1
print("layer has {} parameters, {} elements.".format( print("layer has {} parameters, {} elements.".format(num_params,
num_params, num_elements)) num_elements))
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import io
import re
from setuptools import setup, find_packages
def read(*names, **kwargs):
    with io.open(
            os.path.join(os.path.dirname(__file__), *names),
            encoding=kwargs.get("encoding", "utf8")) as fp:
        return fp.read()
@@ -19,6 +33,7 @@ def find_version(*file_paths):
        return version_match.group(1)
    raise RuntimeError("Unable to find version string.")


VERSION = find_version('parakeet', '__init__.py')
long_description = read('README.md')
@@ -32,17 +47,26 @@ setup_info = dict(
    description='Speech synthesis tools and models based on Paddlepaddle',
    long_description=long_description,
    license='Apache 2',
    install_requires=[
        'numpy',
        'nltk',
        'inflect',
        'librosa',
        'unidecode',
        'numba',
        'tqdm',
        'matplotlib',
        'tensorboardX',
        'tensorboard',
        'scipy',
        'ruamel.yaml',
        'pandas',
        'sox',
        'soundfile',
    ],

    # Package info
    packages=find_packages(exclude=('tests', 'tests.*')),
    zip_safe=True, )

setup(**setup_info)
\ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets.ljspeech import LJSpeech
from parakeet.data.datacargo import DataCargo
...
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from parakeet.datasets import vctk
from pathlib import Path
from parakeet.data.datacargo import DataCargo

root = Path("/workspace/datasets/VCTK-Corpus")
vctk_dataset = vctk.VCTK(root)
vctk_cargo = DataCargo(
    vctk_dataset, batch_size=16, shuffle=True, drop_last=True)

for i, batch in enumerate(vctk_cargo):
    print(i)
from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals

import argparse
import io, re
import sys, os
import subprocess
import platform

COPYRIGHT = '''
Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''

LANG_COMMENT_MARK = None

NEW_LINE_MARK = None

COPYRIGHT_HEADER = None

if platform.system() == "Windows":
    NEW_LINE_MARK = "\r\n"
else:
    NEW_LINE_MARK = '\n'
    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
    date, err = process.communicate()
    date = date.decode("utf-8").rstrip("\n")
    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
def generate_copyright(template, lang='C'):
    if lang == 'Python':
        LANG_COMMENT_MARK = '#'
    else:
        LANG_COMMENT_MARK = "//"

    lines = template.split(NEW_LINE_MARK)
    BLANK = " "
    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
    for lino, line in enumerate(lines):
        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
        if len(line) == 0:
            BLANK = ""
        else:
            BLANK = " "
        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK

    return ans + "\n"
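For a Python file, generate_copyright prefixes each kept line of COPYRIGHT with '#', producing exactly the kind of banner seen at the top of the files in this merge request; for C-family files it uses '//'. A quick sanity check (sketch):

```python
banner = generate_copyright(COPYRIGHT, lang='Python')
assert banner.splitlines()[0].startswith("# Copyright (c)")
assert all(line.startswith("#") for line in banner.splitlines() if line)
```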
def lang_type(filename):
    if filename.endswith(".py"):
        return "Python"
    elif filename.endswith(".h"):
        return "C"
    elif filename.endswith(".c"):
        return "C"
    elif filename.endswith(".hpp"):
        return "C"
    elif filename.endswith(".cc"):
        return "C"
    elif filename.endswith(".cpp"):
        return "C"
    elif filename.endswith(".cu"):
        return "C"
    elif filename.endswith(".cuh"):
        return "C"
    elif filename.endswith(".go"):
        return "C"
    elif filename.endswith(".proto"):
        return "C"
    else:
        print("Unsupported filetype %s" % filename)
        exit(0)
PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")


def main(argv=None):
    parser = argparse.ArgumentParser(
        description='Checker for copyright declaration.')
    parser.add_argument('filenames', nargs='*', help='Filenames to check')
    args = parser.parse_args(argv)

    retv = 0
    for filename in args.filenames:
        fd = io.open(filename, encoding="utf-8")
        first_line = fd.readline()
        second_line = fd.readline()
        if "COPYRIGHT (C)" in first_line.upper(): continue
        if first_line.startswith("#!") or PYTHON_ENCODE.match(
                second_line) != None or PYTHON_ENCODE.match(first_line) != None:
            continue
        original_contents = io.open(filename, encoding="utf-8").read()
        new_contents = generate_copyright(
            COPYRIGHT, lang_type(filename)) + original_contents
        print('Auto Insert Copyright Header {}'.format(filename))
        retv = 1
        with io.open(filename, 'w') as output_file:
            output_file.write(new_contents)

    return retv


if __name__ == '__main__':
    exit(main())