add license

9d796994 · lifuchen · f84d6bec · 9d796994 · 9d796994 · 9d796994
92 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,3 +25,11 @@
        files: \.md$
    -   id: remove-tabs
        files: \.md$
+-   repo: local
+    hooks:
+    -   id: copyright_checker
+        name: copyright_checker
+        entry: python ./tools/copyright.hook
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
+        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
--- a/examples/deepvoice3/README.md
+++ b/examples/deepvoice3/README.md
-# Deepvoice 3 
+# Deepvoice 3
 Paddle implementation of deepvoice 3 in dynamic graph, a convolutional network based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
@@ -22,7 +22,7 @@ The model consists of an encoder, a decoder and a converter (and a speaker embed
 ## Project Structure
 ```text
-├── data.py          data_processing 
+├── data.py          data_processing
 ├── ljspeech.yaml    (example) configuration file
 ├── sentences.txt    sample sentences
 ├── synthesis.py     script to synthesize waveform from text
@@ -50,7 +50,7 @@ optional arguments:
                        The directory to save result.
  -g DEVICE, --device DEVICE
                        device to use
-``` 
+```
 1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly. You can also change some values in the configuration file and train the model with a different config.
 2. `--data` is the path of the LJSpeech dataset, the extracted folder from the downloaded archive (the folder which contains metadata.txt).
@@ -61,7 +61,7 @@ optional arguments:
 ├── checkpoints      # checkpoint
 ├── log              # tensorboard log
 └── states           # train and evaluation results
-    ├── alignments   # attention 
+    ├── alignments   # attention
    ├── lin_spec     # linear spectrogram
    ├── mel_spec     # mel spectrogram
    └── waveform     # waveform (.wav files)
@@ -112,4 +112,3 @@ example script:
 ```bash
 python synthesis.py --config=./ljspeech.yaml --device=0 experiment/checkpoints/model_step_005000000 sentences.txt generated
 ```
--- a/examples/deepvoice3/data.py
+++ b/examples/deepvoice3/data.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import csv
 from pathlib import Path
@@ -79,10 +93,11 @@ class Transform(object):
        y = signal.lfilter([1., -self.preemphasis], [1.], wav)
        # STFT
-        D = librosa.stft(y=y,
+        D = librosa.stft(
-                         n_fft=self.n_fft,
+            y=y,
-                         win_length=self.win_length,
+            n_fft=self.n_fft,
-                         hop_length=self.hop_length)
+            win_length=self.win_length,
+            hop_length=self.hop_length)
        S = np.abs(D)
        # to db and normalize to 0-1
@@ -96,11 +111,8 @@ class Transform(object):
        # mel scale and to db and normalize to 0-1,
        # CAUTION: pass linear scale S, not dbscaled S
-        S_mel = librosa.feature.melspectrogram(S=S,
+        S_mel = librosa.feature.melspectrogram(
-                                               n_mels=self.n_mels,
+            S=S, n_mels=self.n_mels, fmin=self.fmin, fmax=self.fmax, power=1.)
-                                               fmin=self.fmin,
-                                               fmax=self.fmax,
-                                               power=1.)
        S_mel = 20 * np.log10(np.maximum(amplitude_min,
                                         S_mel)) - self.ref_level_db
        S_mel_norm = (S_mel - self.min_level_db) / (-self.min_level_db)
@@ -148,20 +160,18 @@ class DataCollector(object):
            (mix_grapheme_phonemes, text_length, speaker_id, S_norm,
             S_mel_norm, num_frames) = example
            text_sequences.append(
-                np.pad(mix_grapheme_phonemes,
+                np.pad(mix_grapheme_phonemes, (0, max_text_length - text_length
-                       (0, max_text_length - text_length)))
+                                               )))
            lin_specs.append(
-                np.pad(S_norm,
+                np.pad(S_norm, ((0, 0), (self._pad_begin, max_frames -
-                       ((0, 0), (self._pad_begin,
+                                         self._pad_begin - num_frames))))
-                                 max_frames - self._pad_begin - num_frames))))
            mel_specs.append(
-                np.pad(S_mel_norm,
+                np.pad(S_mel_norm, ((0, 0), (self._pad_begin, max_frames -
-                       ((0, 0), (self._pad_begin,
+                                             self._pad_begin - num_frames))))
-                                 max_frames - self._pad_begin - num_frames))))
            done_flags.append(
                np.pad(np.zeros((int(np.ceil(num_frames // self._factor)), )),
-                       (0, max_decoder_length -
+                       (0, max_decoder_length - int(
-                        int(np.ceil(num_frames // self._factor))),
+                           np.ceil(num_frames // self._factor))),
                       constant_values=1))
        text_sequences = np.array(text_sequences).astype(np.int64)
        lin_specs = np.transpose(np.array(lin_specs),

--- a/examples/deepvoice3/synthesis.py
+++ b/examples/deepvoice3/synthesis.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import argparse
 import ruamel.yaml
@@ -22,11 +36,8 @@ if __name__ == "__main__":
    parser.add_argument("checkpoint", type=str, help="checkpoint to load.")
    parser.add_argument("text", type=str, help="text file to synthesize")
    parser.add_argument("output_path", type=str, help="path to save results")
-    parser.add_argument("-g",
+    parser.add_argument(
-                        "--device",
+        "-g", "--device", type=int, default=-1, help="device to use")
-                        type=int,
-                        default=-1,
-                        help="device to use")
    args = parser.parse_args()
    with open(args.config, 'rt') as f:
@@ -76,15 +87,14 @@ if __name__ == "__main__":
        window_ahead = model_config["window_ahead"]
        key_projection = model_config["key_projection"]
        value_projection = model_config["value_projection"]
-        dv3 = make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
+        dv3 = make_model(
-                         padding_idx, embedding_std, max_positions, n_vocab,
+            n_speakers, speaker_dim, speaker_embed_std, embed_dim, padding_idx,
-                         freeze_embedding, filter_size, encoder_channels,
+            embedding_std, max_positions, n_vocab, freeze_embedding,
-                         n_mels, decoder_channels, r,
+            filter_size, encoder_channels, n_mels, decoder_channels, r,
-                         trainable_positional_encodings, use_memory_mask,
+            trainable_positional_encodings, use_memory_mask,
-                         query_position_rate, key_position_rate,
+            query_position_rate, key_position_rate, window_backward,
-                         window_backward, window_ahead, key_projection,
+            window_ahead, key_projection, value_projection, downsample_factor,
-                         value_projection, downsample_factor, linear_dim,
+            linear_dim, use_decoder_states, converter_channels, dropout)
-                         use_decoder_states, converter_channels, dropout)
        summary(dv3)
        state, _ = dg.load_dygraph(args.checkpoint)

--- a/examples/deepvoice3/train.py
+++ b/examples/deepvoice3/train.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import argparse
 import ruamel.yaml

--- a/examples/deepvoice3/utils.py
+++ b/examples/deepvoice3/utils.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import numpy as np
 from matplotlib import cm
@@ -28,8 +42,9 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
               converter_channels, dropout):
    """just a simple function to create a deepvoice 3 model"""
    if n_speakers > 1:
-        spe = dg.Embedding((n_speakers, speaker_dim),
+        spe = dg.Embedding(
-                           param_attr=I.Normal(scale=speaker_embed_std))
+            (n_speakers, speaker_dim),
+            param_attr=I.Normal(scale=speaker_embed_std))
    else:
        spe = None
@@ -45,17 +60,17 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
        ConvSpec(h, k, 1),
-        ConvSpec(h, k, 3),
+        ConvSpec(h, k, 3), )
-    )
+    enc = Encoder(
-    enc = Encoder(n_vocab,
+        n_vocab,
-                  embed_dim,
+        embed_dim,
-                  n_speakers,
+        n_speakers,
-                  speaker_dim,
+        speaker_dim,
-                  padding_idx=None,
+        padding_idx=None,
-                  embedding_weight_std=embedding_std,
+        embedding_weight_std=embedding_std,
-                  convolutions=encoder_convolutions,
+        convolutions=encoder_convolutions,
-                  max_positions=max_positions,
+        max_positions=max_positions,
-                  dropout=dropout)
+        dropout=dropout)
    if freeze_embedding:
        freeze(enc.embed)
@@ -66,28 +81,28 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 3),
        ConvSpec(h, k, 9),
        ConvSpec(h, k, 27),
-        ConvSpec(h, k, 1),
+        ConvSpec(h, k, 1), )
-    )
    attention = [True, False, False, False, True]
    force_monotonic_attention = [True, False, False, False, True]
-    dec = Decoder(n_speakers,
+    dec = Decoder(
-                  speaker_dim,
+        n_speakers,
-                  embed_dim,
+        speaker_dim,
-                  mel_dim,
+        embed_dim,
-                  r=r,
+        mel_dim,
-                  max_positions=max_positions,
+        r=r,
-                  padding_idx=padding_idx,
+        max_positions=max_positions,
-                  preattention=prenet_convolutions,
+        padding_idx=padding_idx,
-                  convolutions=attentive_convolutions,
+        preattention=prenet_convolutions,
-                  attention=attention,
+        convolutions=attentive_convolutions,
-                  dropout=dropout,
+        attention=attention,
-                  use_memory_mask=use_memory_mask,
+        dropout=dropout,
-                  force_monotonic_attention=force_monotonic_attention,
+        use_memory_mask=use_memory_mask,
-                  query_position_rate=query_position_rate,
+        force_monotonic_attention=force_monotonic_attention,
-                  key_position_rate=key_position_rate,
+        query_position_rate=query_position_rate,
-                  window_range=WindowRange(window_behind, window_ahead),
+        key_position_rate=key_position_rate,
-                  key_projection=key_projection,
+        window_range=WindowRange(window_behind, window_ahead),
-                  value_projection=value_projection)
+        key_projection=key_projection,
+        value_projection=value_projection)
    if not trainable_positional_encodings:
        freeze(dec.embed_keys_positions)
        freeze(dec.embed_query_positions)
@@ -97,15 +112,15 @@ def make_model(n_speakers, speaker_dim, speaker_embed_std, embed_dim,
        ConvSpec(h, k, 1),
        ConvSpec(h, k, 3),
        ConvSpec(2 * h, k, 1),
-        ConvSpec(2 * h, k, 3),
+        ConvSpec(2 * h, k, 3), )
-    )
+    cvt = Converter(
-    cvt = Converter(n_speakers,
+        n_speakers,
-                    speaker_dim,
+        speaker_dim,
-                    dec.state_dim if use_decoder_states else mel_dim,
+        dec.state_dim if use_decoder_states else mel_dim,
-                    linear_dim,
+        linear_dim,
-                    time_upsampling=downsample_factor,
+        time_upsampling=downsample_factor,
-                    convolutions=postnet_convolutions,
+        convolutions=postnet_convolutions,
-                    dropout=dropout)
+        dropout=dropout)
    dv3 = DeepVoice3(enc, dec, cvt, spe, use_decoder_states)
    return dv3
@@ -115,8 +130,10 @@ def eval_model(model, text, replace_pronounciation_prob, min_level_db,
               ref_level_db, power, n_iter, win_length, hop_length,
               preemphasis):
    """generate waveform from text using a deepvoice 3 model"""
-    text = np.array(en.text_to_sequence(text, p=replace_pronounciation_prob),
+    text = np.array(
-                    dtype=np.int64)
+        en.text_to_sequence(
+            text, p=replace_pronounciation_prob),
+        dtype=np.int64)
    length = len(text)
    print("text sequence's length: {}".format(length))
    text_positions = np.arange(1, 1 + length)
@@ -145,10 +162,11 @@ def spec_to_waveform(spec, min_level_db, ref_level_db, power, n_iter,
    """
    denoramlized = np.clip(spec, 0, 1) * (-min_level_db) + min_level_db
    lin_scaled = np.exp((denoramlized + ref_level_db) / 20 * np.log(10))
-    wav = librosa.griffinlim(lin_scaled**power,
+    wav = librosa.griffinlim(
-                             n_iter=n_iter,
+        lin_scaled**power,
-                             hop_length=hop_length,
+        n_iter=n_iter,
-                             win_length=win_length)
+        hop_length=hop_length,
+        win_length=win_length)
    if preemphasis > 0:
        wav = signal.lfilter([1.], [1., -preemphasis], wav)
    return wav
@@ -225,28 +243,30 @@ def save_state(save_dir,
        plt.colorbar()
        plt.title("mel_input")
        plt.savefig(
-            os.path.join(path,
+            os.path.join(path, "target_mel_spec_step{:09d}.png".format(
-                         "target_mel_spec_step{:09d}.png".format(global_step)))
+                global_step)))
        plt.close()
-        writer.add_image("target/mel_spec",
+        writer.add_image(
-                         cm.viridis(mel_input),
+            "target/mel_spec",
-                         global_step,
+            cm.viridis(mel_input),
-                         dataformats="HWC")
+            global_step,
+            dataformats="HWC")
        plt.figure(figsize=(10, 3))
        display.specshow(mel_output)
        plt.colorbar()
        plt.title("mel_output")
        plt.savefig(
-            os.path.join(
+            os.path.join(path, "predicted_mel_spec_step{:09d}.png".format(
-                path, "predicted_mel_spec_step{:09d}.png".format(global_step)))
+                global_step)))
        plt.close()
-        writer.add_image("predicted/mel_spec",
+        writer.add_image(
-                         cm.viridis(mel_output),
+            "predicted/mel_spec",
-                         global_step,
+            cm.viridis(mel_output),
-                         dataformats="HWC")
+            global_step,
+            dataformats="HWC")
    if lin_input is not None and lin_output is not None:
        lin_input = lin_input[0].numpy().T
@@ -258,28 +278,30 @@ def save_state(save_dir,
        plt.colorbar()
        plt.title("mel_input")
        plt.savefig(
-            os.path.join(path,
+            os.path.join(path, "target_lin_spec_step{:09d}.png".format(
-                         "target_lin_spec_step{:09d}.png".format(global_step)))
+                global_step)))
        plt.close()
-        writer.add_image("target/lin_spec",
+        writer.add_image(
-                         cm.viridis(lin_input),
+            "target/lin_spec",
-                         global_step,
+            cm.viridis(lin_input),
-                         dataformats="HWC")
+            global_step,
+            dataformats="HWC")
        plt.figure(figsize=(10, 3))
        display.specshow(lin_output)
        plt.colorbar()
        plt.title("mel_input")
        plt.savefig(
-            os.path.join(
+            os.path.join(path, "predicted_lin_spec_step{:09d}.png".format(
-                path, "predicted_lin_spec_step{:09d}.png".format(global_step)))
+                global_step)))
        plt.close()
-        writer.add_image("predicted/lin_spec",
+        writer.add_image(
-                         cm.viridis(lin_output),
+            "predicted/lin_spec",
-                         global_step,
+            cm.viridis(lin_output),
-                         dataformats="HWC")
+            global_step,
+            dataformats="HWC")
    if alignments is not None and len(alignments.shape) == 4:
        path = os.path.join(save_dir, "alignments")
@@ -290,10 +312,11 @@ def save_state(save_dir,
                "train_attn_layer_{}_step_{}.png".format(idx, global_step))
            plot_alignment(attn_layer, save_path)
-            writer.add_image("train_attn/layer_{}".format(idx),
+            writer.add_image(
-                             cm.viridis(attn_layer),
+                "train_attn/layer_{}".format(idx),
-                             global_step,
+                cm.viridis(attn_layer),
-                             dataformats="HWC")
+                global_step,
+                dataformats="HWC")
    if lin_output is not None:
        wav = spec_to_waveform(lin_output, min_level_db, ref_level_db, power,
@@ -302,7 +325,5 @@ def save_state(save_dir,
        save_path = os.path.join(
            path, "train_sample_step_{:09d}.wav".format(global_step))
        sf.write(save_path, wav, sample_rate)
-        writer.add_audio("train_sample",
+        writer.add_audio(
-                         wav,
+            "train_sample", wav, global_step, sample_rate=sample_rate)
-                         global_step,
-                         sample_rate=sample_rate)
--- a/examples/fastspeech/README.md
+++ b/examples/fastspeech/README.md
@@ -57,7 +57,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
 If you wish to resume from an existing model, please set ``--checkpoint_path`` and ``--fastspeech_step``.
-For more help on arguments: 
+For more help on arguments:
 ``python train.py --help``.
 ## Synthesis
@@ -75,5 +75,5 @@ or you can run the script file directly.
 sh synthesis.sh
 ```
-For more help on arguments: 
+For more help on arguments:
 ``python synthesis.py --help``.
--- a/examples/fastspeech/parse.py
+++ b/examples/fastspeech/parse.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import argparse
 def add_config_options_to_parser(parser):
-    parser.add_argument('--config_path', type=str, default='config/fastspeech.yaml',
+    parser.add_argument(
+        '--config_path',
+        type=str,
+        default='config/fastspeech.yaml',
        help="the yaml config file path.")
-    parser.add_argument('--batch_size', type=int, default=32,
+    parser.add_argument(
-        help="batch size for training.")
+        '--batch_size', type=int, default=32, help="batch size for training.")
-    parser.add_argument('--epochs', type=int, default=10000,
+    parser.add_argument(
+        '--epochs',
+        type=int,
+        default=10000,
        help="the number of epoch for training.")
-    parser.add_argument('--lr', type=float, default=0.001,
+    parser.add_argument(
+        '--lr',
+        type=float,
+        default=0.001,
        help="the learning rate for training.")
-    parser.add_argument('--save_step', type=int, default=500,
+    parser.add_argument(
+        '--save_step',
+        type=int,
+        default=500,
        help="checkpointing interval during training.")
-    parser.add_argument('--fastspeech_step', type=int, default=70000,
+    parser.add_argument(
+        '--fastspeech_step',
+        type=int,
+        default=70000,
        help="Global step to restore checkpoint of fastspeech.")
-    parser.add_argument('--use_gpu', type=int, default=1,
+    parser.add_argument(
+        '--use_gpu',
+        type=int,
+        default=1,
        help="use gpu or not during training.")
-    parser.add_argument('--use_data_parallel', type=int, default=0,
+    parser.add_argument(
+        '--use_data_parallel',
+        type=int,
+        default=0,
        help="use data parallel or not during training.")
-    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
-    parser.add_argument('--checkpoint_path', type=str, default=None,
+    parser.add_argument(
+        '--checkpoint_path',
+        type=str,
+        default=None,
        help="the path to load checkpoint or pretrain model.")
-    parser.add_argument('--save_path', type=str, default='./checkpoint',
+    parser.add_argument(
+        '--save_path',
+        type=str,
+        default='./checkpoint',
        help="the path to save checkpoint.")
-    parser.add_argument('--log_dir', type=str, default='./log',
+    parser.add_argument(
+        '--log_dir',
+        type=str,
+        default='./log',
        help="the directory to save tensorboard log.")
-    parser.add_argument('--sample_path', type=str, default='./sample',
+    parser.add_argument(
+        '--sample_path',
+        type=str,
+        default='./sample',
        help="the directory to save audio sample in synthesis.")
-    parser.add_argument('--transtts_path', type=str, default='./log',
+    parser.add_argument(
+        '--transtts_path',
+        type=str,
+        default='./log',
        help="the directory to load pretrain transformerTTS model.")
-    parser.add_argument('--transformer_step', type=int, default=160000,
+    parser.add_argument(
+        '--transformer_step',
+        type=int,
+        default=160000,
        help="the step to load transformerTTS model.")
--- a/examples/fastspeech/synthesis.py
+++ b/examples/fastspeech/synthesis.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from tensorboardX import SummaryWriter
 from collections import OrderedDict
@@ -12,6 +25,7 @@ from parakeet.g2p.en import text_to_sequence
 from parakeet import audio
 from parakeet.models.fastspeech.fastspeech import FastSpeech
 def load_checkpoint(step, model_path):
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -22,13 +36,14 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict
 def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
    # tensorboard
    if not os.path.exists(args.log_dir):
-            os.mkdir(args.log_dir)
+        os.mkdir(args.log_dir)
-    path = os.path.join(args.log_dir,'synthesis')
+    path = os.path.join(args.log_dir, 'synthesis')
    with open(args.config_path) as f:
        cfg = yaml.load(f, Loader=yaml.Loader)
@@ -37,24 +52,28 @@ def synthesis(text_input, args):
    with dg.guard(place):
        model = FastSpeech(cfg)
-        model.set_dict(load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech")))
+        model.set_dict(
+            load_checkpoint(
+                str(args.fastspeech_step),
+                os.path.join(args.checkpoint_path, "fastspeech")))
        model.eval()
        text = np.asarray(text_to_sequence(text_input))
-        text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
+        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
-        pos_text = np.arange(1, text.shape[1]+1)
+        pos_text = np.arange(1, text.shape[1] + 1)
-        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
+        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
-        mel_output, mel_output_postnet = model(text, pos_text, alpha=args.alpha)
+        mel_output, mel_output_postnet = model(
+            text, pos_text, alpha=args.alpha)
        _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg['audio']['sr'], 
+            sample_rate=cfg['audio']['sr'],
-            num_mels=cfg['audio']['num_mels'], 
+            num_mels=cfg['audio']['num_mels'],
-            min_level_db=cfg['audio']['min_level_db'], 
+            min_level_db=cfg['audio']['min_level_db'],
-            ref_level_db=cfg['audio']['ref_level_db'], 
+            ref_level_db=cfg['audio']['ref_level_db'],
-            n_fft=cfg['audio']['n_fft'], 
+            n_fft=cfg['audio']['n_fft'],
-            win_length= cfg['audio']['win_length'], 
+            win_length=cfg['audio']['win_length'],
-            hop_length= cfg['audio']['hop_length'],
+            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
@@ -67,14 +86,17 @@ def synthesis(text_input, args):
            do_trim_silence=False,
            sound_norm=False)
-        mel_output_postnet = fluid.layers.transpose(fluid.layers.squeeze(mel_output_postnet,[0]), [1,0])
+        mel_output_postnet = fluid.layers.transpose(
-        wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy())
+            fluid.layers.squeeze(mel_output_postnet, [0]), [1, 0])
+        wav = _ljspeech_processor.inv_melspectrogram(mel_output_postnet.numpy(
+        ))
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        print("Synthesis completed !!!")
    writer.close()
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train Fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    synthesis("Transformer model is so fast!", args)
\ No newline at end of file
--- a/examples/fastspeech/train.py
+++ b/examples/fastspeech/train.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 import argparse
 import os
@@ -20,8 +33,10 @@ import sys
 sys.path.append("../transformer_tts")
 from data import LJSpeechLoader
 def load_checkpoint(step, model_path):
-    model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
+    model_dict, opti_dict = fluid.dygraph.load_dygraph(
+        os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
@@ -30,6 +45,7 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict
 def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
@@ -43,26 +59,33 @@ def main(args):
             if args.use_gpu else fluid.CPUPlace())
    if not os.path.exists(args.log_dir):
-            os.mkdir(args.log_dir)
+        os.mkdir(args.log_dir)
-    path = os.path.join(args.log_dir,'fastspeech')
+    path = os.path.join(args.log_dir, 'fastspeech')
    writer = SummaryWriter(path) if local_rank == 0 else None
    with dg.guard(place):
        with fluid.unique_name.guard():
            transformerTTS = TransformerTTS(cfg)
-            model_dict, _ = load_checkpoint(str(args.transformer_step), os.path.join(args.transtts_path, "transformer"))
+            model_dict, _ = load_checkpoint(
+                str(args.transformer_step),
+                os.path.join(args.transtts_path, "transformer"))
            transformerTTS.set_dict(model_dict)
            transformerTTS.eval()
        model = FastSpeech(cfg)
        model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
+        optimizer = fluid.optimizer.AdamOptimizer(
-                                                  parameter_list=model.parameters())
+            learning_rate=dg.NoamDecay(1 / (
-        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
+                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
+            parameter_list=model.parameters())
+        reader = LJSpeechLoader(
+            cfg, args, nranks, local_rank, shuffle=True).reader()
        if args.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(args.fastspeech_step), os.path.join(args.checkpoint_path, "fastspeech"))
+            model_dict, opti_dict = load_checkpoint(
+                str(args.fastspeech_step),
+                os.path.join(args.checkpoint_path, "fastspeech"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.fastspeech_step
@@ -76,31 +99,42 @@ def main(args):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
-                pbar.set_description('Processing at epoch %d'%epoch)
+                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, mel_lens = data
-                _, _, attn_probs, _, _, _ = transformerTTS(character, mel_input, pos_text, pos_mel)
+                _, _, attn_probs, _, _, _ = transformerTTS(
-                alignment = dg.to_variable(get_alignment(attn_probs, mel_lens, cfg['transformer_head'])).astype(np.float32)
+                    character, mel_input, pos_text, pos_mel)
+                alignment = dg.to_variable(
+                    get_alignment(attn_probs, mel_lens, cfg[
+                        'transformer_head'])).astype(np.float32)
                global_step += 1
                #Forward
-                result= model(character, 
+                result = model(
-                              pos_text, 
+                    character,
-                              mel_pos=pos_mel,  
+                    pos_text,
-                              length_target=alignment)
+                    mel_pos=pos_mel,
+                    length_target=alignment)
                mel_output, mel_output_postnet, duration_predictor_output, _, _ = result
                mel_loss = layers.mse_loss(mel_output, mel)
                mel_postnet_loss = layers.mse_loss(mel_output_postnet, mel)
-                duration_loss = layers.mean(layers.abs(layers.elementwise_sub(duration_predictor_output, alignment)))
+                duration_loss = layers.mean(
+                    layers.abs(
+                        layers.elementwise_sub(duration_predictor_output,
+                                               alignment)))
                total_loss = mel_loss + mel_postnet_loss + duration_loss
-                if local_rank==0:
+                if local_rank == 0:
-                    writer.add_scalar('mel_loss', mel_loss.numpy(), global_step)
+                    writer.add_scalar('mel_loss',
-                    writer.add_scalar('post_mel_loss', mel_postnet_loss.numpy(), global_step)
+                                      mel_loss.numpy(), global_step)
-                    writer.add_scalar('duration_loss', duration_loss.numpy(), global_step)
+                    writer.add_scalar('post_mel_loss',
-                    writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
+                                      mel_postnet_loss.numpy(), global_step)
+                    writer.add_scalar('duration_loss',
+                                      duration_loss.numpy(), global_step)
+                    writer.add_scalar('learning_rate',
+                                      optimizer._learning_rate.step().numpy(),
+                                      global_step)
                if args.use_data_parallel:
                    total_loss = model.scale_loss(total_loss)
@@ -108,21 +142,25 @@ def main(args):
                    model.apply_collective_grads()
                else:
                    total_loss.backward()
-                optimizer.minimize(total_loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
+                optimizer.minimize(
+                    total_loss,
+                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
+                        'grad_clip_thresh']))
                model.clear_gradients()
-                 # save checkpoint
+                # save checkpoint
-                if local_rank==0 and global_step % args.save_step == 0:
+                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
-                    save_path = os.path.join(args.save_path,'fastspeech/%d' % global_step)
+                    save_path = os.path.join(args.save_path,
+                                             'fastspeech/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
-        if local_rank==0:
+        if local_rank == 0:
            writer.close()
-if __name__ =='__main__':
+if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train Fastspeech model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()

--- a/examples/transformer_tts/README.md
+++ b/examples/transformer_tts/README.md
@@ -50,7 +50,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
 if you wish to resume from an exists model, please set ``--checkpoint_path`` and ``--transformer_step``
-For more help on arguments: 
+For more help on arguments:
 ``python train_transformer.py --help``.
 ## Train Vocoder
@@ -78,7 +78,7 @@ python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog tr
 ```
 if you wish to resume from an exists model, please set ``--checkpoint_path`` and ``--vocoder_step``
-For more help on arguments: 
+For more help on arguments:
 ``python train_vocoder.py --help``.
 ## Synthesis
@@ -101,5 +101,5 @@ sh synthesis.sh
 And the audio file will be saved in ``--sample_path``.
-For more help on arguments: 
+For more help on arguments:
 ``python synthesis.py --help``.
--- a/examples/transformer_tts/data.py
+++ b/examples/transformer_tts/data.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from pathlib import Path
 import numpy as np
 import pandas as pd
@@ -12,23 +25,43 @@ from parakeet.data.datacargo import DataCargo
 from parakeet.data.batch import TextIDBatcher, SpecBatcher
 from parakeet.data.dataset import DatasetMixin, TransformDataset
 class LJSpeechLoader:
-    def __init__(self, config, args, nranks, rank, is_vocoder=False, shuffle=True):
+    def __init__(self,
+                 config,
+                 args,
+                 nranks,
+                 rank,
+                 is_vocoder=False,
+                 shuffle=True):
        place = fluid.CUDAPlace(rank) if args.use_gpu else fluid.CPUPlace()
        LJSPEECH_ROOT = Path(args.data_path)
        metadata = LJSpeechMetaData(LJSPEECH_ROOT)
        transformer = LJSpeech(config)
        dataset = TransformDataset(metadata, transformer)
-        sampler = DistributedSampler(len(metadata), nranks, rank, shuffle=shuffle)
+        sampler = DistributedSampler(
+            len(metadata), nranks, rank, shuffle=shuffle)
        assert args.batch_size % nranks == 0
        each_bs = args.batch_size // nranks
        if is_vocoder:
-            dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples_vocoder, drop_last=True)
+            dataloader = DataCargo(
+                dataset,
+                sampler=sampler,
+                batch_size=each_bs,
+                shuffle=shuffle,
+                batch_fn=batch_examples_vocoder,
+                drop_last=True)
        else:
-            dataloader = DataCargo(dataset, sampler=sampler, batch_size=each_bs, shuffle=shuffle, batch_fn=batch_examples, drop_last=True)
+            dataloader = DataCargo(
+                dataset,
+                sampler=sampler,
+                batch_size=each_bs,
+                shuffle=shuffle,
+                batch_fn=batch_examples,
+                drop_last=True)
        self.reader = fluid.io.DataLoader.from_generator(
            capacity=32,
            iterable=True,
@@ -63,13 +96,13 @@ class LJSpeech(object):
        super(LJSpeech, self).__init__()
        self.config = config
        self._ljspeech_processor = audio.AudioProcessor(
-            sample_rate=config['audio']['sr'], 
+            sample_rate=config['audio']['sr'],
-            num_mels=config['audio']['num_mels'], 
+            num_mels=config['audio']['num_mels'],
-            min_level_db=config['audio']['min_level_db'], 
+            min_level_db=config['audio']['min_level_db'],
-            ref_level_db=config['audio']['ref_level_db'], 
+            ref_level_db=config['audio']['ref_level_db'],
-            n_fft=config['audio']['n_fft'], 
+            n_fft=config['audio']['n_fft'],
-            win_length= config['audio']['win_length'], 
+            win_length=config['audio']['win_length'],
-            hop_length= config['audio']['hop_length'],
+            hop_length=config['audio']['hop_length'],
            power=config['audio']['power'],
            preemphasis=config['audio']['preemphasis'],
            signal_norm=True,
@@ -81,7 +114,7 @@ class LJSpeech(object):
            griffin_lim_iters=60,
            do_trim_silence=False,
            sound_norm=False)
    def __call__(self, metadatum):
        """All the code for generating an Example from a metadatum. If you want a 
        different preprocessing pipeline, you can override this method. 
@@ -90,13 +123,15 @@ class LJSpeech(object):
        method.
        """
        fname, raw_text, normalized_text = metadatum
        # load -> trim -> preemphasis -> stft -> magnitude -> mel_scale -> logscale -> normalize
        wav = self._ljspeech_processor.load_wav(str(fname))
        mag = self._ljspeech_processor.spectrogram(wav).astype(np.float32)
        mel = self._ljspeech_processor.melspectrogram(wav).astype(np.float32)
-        phonemes = np.array(g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
+        phonemes = np.array(
-        return (mag, mel, phonemes) # maybe we need to implement it as a map in the future
+            g2p.en.text_to_sequence(normalized_text), dtype=np.int64)
+        return (mag, mel, phonemes
+                )  # maybe we need to implement it as a map in the future
 def batch_examples(batch):
@@ -109,44 +144,71 @@ def batch_examples(batch):
    pos_mels = []
    for data in batch:
        _, mel, text = data
-        mel_inputs.append(np.concatenate([np.zeros([mel.shape[0], 1], np.float32), mel[:,:-1]], axis=-1))
+        mel_inputs.append(
+            np.concatenate(
+                [np.zeros([mel.shape[0], 1], np.float32), mel[:, :-1]],
+                axis=-1))
        mel_lens.append(mel.shape[1])
        text_lens.append(len(text))
        pos_texts.append(np.arange(1, len(text) + 1))
        pos_mels.append(np.arange(1, mel.shape[1] + 1))
        mels.append(mel)
        texts.append(text)
    # Sort by text_len in descending order
-    texts = [i for i,_ in sorted(zip(texts, text_lens), key=lambda x: x[1], reverse=True)]
+    texts = [
-    mels = [i for i,_ in sorted(zip(mels, text_lens), key=lambda x: x[1], reverse=True)]
+        i
-    mel_inputs = [i for i,_ in sorted(zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)]
+        for i, _ in sorted(
-    mel_lens = [i for i,_ in sorted(zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)]
+            zip(texts, text_lens), key=lambda x: x[1], reverse=True)
-    pos_texts = [i for i,_ in sorted(zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)]
+    ]
-    pos_mels = [i for i,_ in sorted(zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)]
+    mels = [
+        i
+        for i, _ in sorted(
+            zip(mels, text_lens), key=lambda x: x[1], reverse=True)
+    ]
+    mel_inputs = [
+        i
+        for i, _ in sorted(
+            zip(mel_inputs, text_lens), key=lambda x: x[1], reverse=True)
+    ]
+    mel_lens = [
+        i
+        for i, _ in sorted(
+            zip(mel_lens, text_lens), key=lambda x: x[1], reverse=True)
+    ]
+    pos_texts = [
+        i
+        for i, _ in sorted(
+            zip(pos_texts, text_lens), key=lambda x: x[1], reverse=True)
+    ]
+    pos_mels = [
+        i
+        for i, _ in sorted(
+            zip(pos_mels, text_lens), key=lambda x: x[1], reverse=True)
+    ]
    text_lens = sorted(text_lens, reverse=True)
    # Pad sequence with largest len of the batch
-    texts = TextIDBatcher(pad_id=0)(texts)   #(B, T)
+    texts = TextIDBatcher(pad_id=0)(texts)  #(B, T)
-    pos_texts = TextIDBatcher(pad_id=0)(pos_texts) #(B,T)
+    pos_texts = TextIDBatcher(pad_id=0)(pos_texts)  #(B,T)
-    pos_mels = TextIDBatcher(pad_id=0)(pos_mels) #(B,T)
+    pos_mels = TextIDBatcher(pad_id=0)(pos_mels)  #(B,T)
-    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1)) #(B,T,num_mels)
+    mels = np.transpose(
-    mel_inputs = np.transpose(SpecBatcher(pad_value=0.)(mel_inputs), axes=(0,2,1))#(B,T,num_mels)
+        SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))  #(B,T,num_mels)
-    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens), np.array(mel_lens))
+    mel_inputs = np.transpose(
+        SpecBatcher(pad_value=0.)(mel_inputs), axes=(0, 2, 1))  #(B,T,num_mels)
+    return (texts, mels, mel_inputs, pos_texts, pos_mels, np.array(text_lens),
+            np.array(mel_lens))
 def batch_examples_vocoder(batch):
-    mels=[]
+    mels = []
-    mags=[]
+    mags = []
    for data in batch:
        mag, mel, _ = data
        mels.append(mel)
        mags.append(mag)
-    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0,2,1))
+    mels = np.transpose(SpecBatcher(pad_value=0.)(mels), axes=(0, 2, 1))
-    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0,2,1))
+    mags = np.transpose(SpecBatcher(pad_value=0.)(mags), axes=(0, 2, 1))
    return (mels, mags)
--- a/examples/transformer_tts/parse.py
+++ b/examples/transformer_tts/parse.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import argparse
 def add_config_options_to_parser(parser):
-    parser.add_argument('--config_path', type=str, default='config/train_transformer.yaml',
+    parser.add_argument(
+        '--config_path',
+        type=str,
+        default='config/train_transformer.yaml',
        help="the yaml config file path.")
-    parser.add_argument('--batch_size', type=int, default=32,
+    parser.add_argument(
-        help="batch size for training.")
+        '--batch_size', type=int, default=32, help="batch size for training.")
-    parser.add_argument('--epochs', type=int, default=10000,
+    parser.add_argument(
+        '--epochs',
+        type=int,
+        default=10000,
        help="the number of epoch for training.")
-    parser.add_argument('--lr', type=float, default=0.001,
+    parser.add_argument(
+        '--lr',
+        type=float,
+        default=0.001,
        help="the learning rate for training.")
-    parser.add_argument('--save_step', type=int, default=500,
+    parser.add_argument(
+        '--save_step',
+        type=int,
+        default=500,
        help="checkpointing interval during training.")
-    parser.add_argument('--image_step', type=int, default=2000,
+    parser.add_argument(
+        '--image_step',
+        type=int,
+        default=2000,
        help="attention image interval during training.")
-    parser.add_argument('--max_len', type=int, default=400,
+    parser.add_argument(
+        '--max_len',
+        type=int,
+        default=400,
        help="The max length of audio when synthsis.")
-    parser.add_argument('--transformer_step', type=int, default=160000,
+    parser.add_argument(
+        '--transformer_step',
+        type=int,
+        default=160000,
        help="Global step to restore checkpoint of transformer.")
-    parser.add_argument('--vocoder_step', type=int, default=90000,
+    parser.add_argument(
+        '--vocoder_step',
+        type=int,
+        default=90000,
        help="Global step to restore checkpoint of postnet.")
-    parser.add_argument('--use_gpu', type=int, default=1,
+    parser.add_argument(
+        '--use_gpu',
+        type=int,
+        default=1,
        help="use gpu or not during training.")
-    parser.add_argument('--use_data_parallel', type=int, default=0,
+    parser.add_argument(
+        '--use_data_parallel',
+        type=int,
+        default=0,
        help="use data parallel or not during training.")
-    parser.add_argument('--stop_token', type=int, default=0,
+    parser.add_argument(
+        '--stop_token',
+        type=int,
+        default=0,
        help="use stop token loss in network or not.")
-    parser.add_argument('--data_path', type=str, default='./dataset/LJSpeech-1.1',
+    parser.add_argument(
+        '--data_path',
+        type=str,
+        default='./dataset/LJSpeech-1.1',
        help="the path of dataset.")
-    parser.add_argument('--checkpoint_path', type=str, default=None,
+    parser.add_argument(
+        '--checkpoint_path',
+        type=str,
+        default=None,
        help="the path to load checkpoint or pretrain model.")
-    parser.add_argument('--save_path', type=str, default='./checkpoint',
+    parser.add_argument(
+        '--save_path',
+        type=str,
+        default='./checkpoint',
        help="the path to save checkpoint.")
-    parser.add_argument('--log_dir', type=str, default='./log',
+    parser.add_argument(
+        '--log_dir',
+        type=str,
+        default='./log',
        help="the directory to save tensorboard log.")
-    parser.add_argument('--sample_path', type=str, default='./sample',
+    parser.add_argument(
+        '--sample_path',
+        type=str,
+        default='./sample',
        help="the directory to save audio sample in synthesis.")
--- a/examples/transformer_tts/synthesis.py
+++ b/examples/transformer_tts/synthesis.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from scipy.io.wavfile import write
 from parakeet.g2p.en import text_to_sequence
@@ -16,6 +29,7 @@ from parakeet import audio
 from parakeet.models.transformer_tts.vocoder import Vocoder
 from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
 def load_checkpoint(step, model_path):
    model_dict, _ = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -26,6 +40,7 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict
 def synthesis(text_input, args):
    place = (fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace())
@@ -34,46 +49,53 @@ def synthesis(text_input, args):
    # tensorboard
    if not os.path.exists(args.log_dir):
-            os.mkdir(args.log_dir)
+        os.mkdir(args.log_dir)
-    path = os.path.join(args.log_dir,'synthesis')
+    path = os.path.join(args.log_dir, 'synthesis')
    writer = SummaryWriter(path)
    with dg.guard(place):
        with fluid.unique_name.guard():
            model = TransformerTTS(cfg)
-            model.set_dict(load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer")))
+            model.set_dict(
+                load_checkpoint(
+                    str(args.transformer_step),
+                    os.path.join(args.checkpoint_path, "transformer")))
            model.eval()
        with fluid.unique_name.guard():
            model_vocoder = Vocoder(cfg, args.batch_size)
-            model_vocoder.set_dict(load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder")))
+            model_vocoder.set_dict(
+                load_checkpoint(
+                    str(args.vocoder_step),
+                    os.path.join(args.checkpoint_path, "vocoder")))
            model_vocoder.eval()
        # init input
        text = np.asarray(text_to_sequence(text_input))
-        text = fluid.layers.unsqueeze(dg.to_variable(text),[0])
+        text = fluid.layers.unsqueeze(dg.to_variable(text), [0])
-        mel_input = dg.to_variable(np.zeros([1,1,80])).astype(np.float32)
+        mel_input = dg.to_variable(np.zeros([1, 1, 80])).astype(np.float32)
-        pos_text = np.arange(1, text.shape[1]+1)
+        pos_text = np.arange(1, text.shape[1] + 1)
-        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text),[0])
+        pos_text = fluid.layers.unsqueeze(dg.to_variable(pos_text), [0])
        pbar = tqdm(range(args.max_len))
        for i in pbar:
-            pos_mel = np.arange(1, mel_input.shape[1]+1)
+            pos_mel = np.arange(1, mel_input.shape[1] + 1)
-            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel),[0])
+            pos_mel = fluid.layers.unsqueeze(dg.to_variable(pos_mel), [0])
-            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(text, mel_input, pos_text, pos_mel)
+            mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
-            mel_input = fluid.layers.concat([mel_input, postnet_pred[:,-1:,:]], axis=1)
+                text, mel_input, pos_text, pos_mel)
+            mel_input = fluid.layers.concat(
+                [mel_input, postnet_pred[:, -1:, :]], axis=1)
        mag_pred = model_vocoder(postnet_pred)
        _ljspeech_processor = audio.AudioProcessor(
-            sample_rate=cfg['audio']['sr'], 
+            sample_rate=cfg['audio']['sr'],
-            num_mels=cfg['audio']['num_mels'], 
+            num_mels=cfg['audio']['num_mels'],
-            min_level_db=cfg['audio']['min_level_db'], 
+            min_level_db=cfg['audio']['min_level_db'],
-            ref_level_db=cfg['audio']['ref_level_db'], 
+            ref_level_db=cfg['audio']['ref_level_db'],
-            n_fft=cfg['audio']['n_fft'], 
+            n_fft=cfg['audio']['n_fft'],
-            win_length= cfg['audio']['win_length'], 
+            win_length=cfg['audio']['win_length'],
-            hop_length= cfg['audio']['hop_length'],
+            hop_length=cfg['audio']['hop_length'],
            power=cfg['audio']['power'],
            preemphasis=cfg['audio']['preemphasis'],
            signal_norm=True,
@@ -86,13 +108,18 @@ def synthesis(text_input, args):
            do_trim_silence=False,
            sound_norm=False)
-        wav = _ljspeech_processor.inv_spectrogram(fluid.layers.transpose(fluid.layers.squeeze(mag_pred,[0]), [1,0]).numpy())
+        wav = _ljspeech_processor.inv_spectrogram(
+            fluid.layers.transpose(
+                fluid.layers.squeeze(mag_pred, [0]), [1, 0]).numpy())
        writer.add_audio(text_input, wav, 0, cfg['audio']['sr'])
        if not os.path.exists(args.sample_path):
            os.mkdir(args.sample_path)
-        write(os.path.join(args.sample_path,'test.wav'), cfg['audio']['sr'], wav)
+        write(
+            os.path.join(args.sample_path, 'test.wav'), cfg['audio']['sr'],
+            wav)
    writer.close()
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Synthesis model")
    add_config_options_to_parser(parser)

--- a/examples/transformer_tts/train_transformer.py
+++ b/examples/transformer_tts/train_transformer.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 from tqdm import tqdm
 from tensorboardX import SummaryWriter
@@ -16,8 +29,10 @@ from parakeet.models.transformer_tts.utils import cross_entropy
 from data import LJSpeechLoader
 from parakeet.models.transformer_tts.transformer_tts import TransformerTTS
 def load_checkpoint(step, model_path):
-    model_dict, opti_dict = fluid.dygraph.load_dygraph(os.path.join(model_path, step))
+    model_dict, opti_dict = fluid.dygraph.load_dygraph(
+        os.path.join(model_path, step))
    new_state_dict = OrderedDict()
    for param in model_dict:
        if param.startswith('_layers.'):
@@ -40,22 +55,27 @@ def main(args):
             if args.use_gpu else fluid.CPUPlace())
    if not os.path.exists(args.log_dir):
-            os.mkdir(args.log_dir)
+        os.mkdir(args.log_dir)
-    path = os.path.join(args.log_dir,'transformer')
+    path = os.path.join(args.log_dir, 'transformer')
    writer = SummaryWriter(path) if local_rank == 0 else None
    with dg.guard(place):
        model = TransformerTTS(cfg)
        model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']), 
+        optimizer = fluid.optimizer.AdamOptimizer(
-                                                  parameter_list=model.parameters())
+            learning_rate=dg.NoamDecay(1 / (
+                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
-        reader = LJSpeechLoader(cfg, args, nranks, local_rank, shuffle=True).reader()
+            parameter_list=model.parameters())
+        reader = LJSpeechLoader(
+            cfg, args, nranks, local_rank, shuffle=True).reader()
        if args.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(args.transformer_step), os.path.join(args.checkpoint_path, "transformer"))
+            model_dict, opti_dict = load_checkpoint(
+                str(args.transformer_step),
+                os.path.join(args.checkpoint_path, "transformer"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.transformer_step
@@ -64,86 +84,112 @@ def main(args):
        if args.use_data_parallel:
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)
        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
-                pbar.set_description('Processing at epoch %d'%epoch)
+                pbar.set_description('Processing at epoch %d' % epoch)
                character, mel, mel_input, pos_text, pos_mel, text_length, _ = data
                global_step += 1
-                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(character, mel_input, pos_text, pos_mel)
+                mel_pred, postnet_pred, attn_probs, stop_preds, attn_enc, attn_dec = model(
+                    character, mel_input, pos_text, pos_mel)
                label = (pos_mel == 0).astype(np.float32)
-                mel_loss = layers.mean(layers.abs(layers.elementwise_sub(mel_pred, mel)))
+                mel_loss = layers.mean(
-                post_mel_loss = layers.mean(layers.abs(layers.elementwise_sub(postnet_pred, mel)))
+                    layers.abs(layers.elementwise_sub(mel_pred, mel)))
+                post_mel_loss = layers.mean(
+                    layers.abs(layers.elementwise_sub(postnet_pred, mel)))
                loss = mel_loss + post_mel_loss
                # Note: When used stop token loss the learning did not work.
                if args.stop_token:
                    stop_loss = cross_entropy(stop_preds, label)
                    loss = loss + stop_loss
-                if local_rank==0:
+                if local_rank == 0:
                    writer.add_scalars('training_loss', {
-                        'mel_loss':mel_loss.numpy(),
+                        'mel_loss': mel_loss.numpy(),
-                        'post_mel_loss':post_mel_loss.numpy()
+                        'post_mel_loss': post_mel_loss.numpy()
                    }, global_step)
                    if args.stop_token:
-                        writer.add_scalar('stop_loss', stop_loss.numpy(), global_step)
+                        writer.add_scalar('stop_loss',
+                                          stop_loss.numpy(), global_step)
                    if args.use_data_parallel:
                        writer.add_scalars('alphas', {
-                            'encoder_alpha':model._layers.encoder.alpha.numpy(),
+                            'encoder_alpha':
-                            'decoder_alpha':model._layers.decoder.alpha.numpy(),
+                            model._layers.encoder.alpha.numpy(),
+                            'decoder_alpha':
+                            model._layers.decoder.alpha.numpy(),
                        }, global_step)
                    else:
                        writer.add_scalars('alphas', {
-                            'encoder_alpha':model.encoder.alpha.numpy(),
+                            'encoder_alpha': model.encoder.alpha.numpy(),
-                            'decoder_alpha':model.decoder.alpha.numpy(),
+                            'decoder_alpha': model.decoder.alpha.numpy(),
                        }, global_step)
-                    writer.add_scalar('learning_rate', optimizer._learning_rate.step().numpy(), global_step)
+                    writer.add_scalar('learning_rate',
+                                      optimizer._learning_rate.step().numpy(),
+                                      global_step)
                    if global_step % args.image_step == 1:
                        for i, prob in enumerate(attn_probs):
                            for j in range(4):
-                                    x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
+                                x = np.uint8(
-                                    writer.add_image('Attention_%d_0'%global_step, x, i*4+j, dataformats="HWC")
+                                    cm.viridis(prob.numpy()[j * 16]) * 255)
+                                writer.add_image(
+                                    'Attention_%d_0' % global_step,
+                                    x,
+                                    i * 4 + j,
+                                    dataformats="HWC")
                        for i, prob in enumerate(attn_enc):
                            for j in range(4):
-                                x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
+                                x = np.uint8(
-                                writer.add_image('Attention_enc_%d_0'%global_step, x, i*4+j, dataformats="HWC")
+                                    cm.viridis(prob.numpy()[j * 16]) * 255)
+                                writer.add_image(
+                                    'Attention_enc_%d_0' % global_step,
+                                    x,
+                                    i * 4 + j,
+                                    dataformats="HWC")
                        for i, prob in enumerate(attn_dec):
                            for j in range(4):
-                                x = np.uint8(cm.viridis(prob.numpy()[j*16]) * 255)
+                                x = np.uint8(
-                                writer.add_image('Attention_dec_%d_0'%global_step, x, i*4+j, dataformats="HWC")
+                                    cm.viridis(prob.numpy()[j * 16]) * 255)
+                                writer.add_image(
+                                    'Attention_dec_%d_0' % global_step,
+                                    x,
+                                    i * 4 + j,
+                                    dataformats="HWC")
                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
-                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
+                optimizer.minimize(
+                    loss,
+                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
+                        'grad_clip_thresh']))
                model.clear_gradients()
                # save checkpoint
-                if local_rank==0 and global_step % args.save_step == 0:
+                if local_rank == 0 and global_step % args.save_step == 0:
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
-                    save_path = os.path.join(args.save_path,'transformer/%d' % global_step)
+                    save_path = os.path.join(args.save_path,
+                                             'transformer/%d' % global_step)
                    dg.save_dygraph(model.state_dict(), save_path)
                    dg.save_dygraph(optimizer.state_dict(), save_path)
-        if local_rank==0:
+        if local_rank == 0:
            writer.close()
-if __name__ =='__main__':
+if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train TransformerTTS model")
    add_config_options_to_parser(parser)

--- a/examples/transformer_tts/train_vocoder.py
+++ b/examples/transformer_tts/train_vocoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from tensorboardX import SummaryWriter
 import os
 from tqdm import tqdm
@@ -13,6 +26,7 @@ import paddle.fluid.layers as layers
 from data import LJSpeechLoader
 from parakeet.models.transformer_tts.vocoder import Vocoder
 def load_checkpoint(step, model_path):
    model_dict, opti_dict = dg.load_dygraph(os.path.join(model_path, step))
    new_state_dict = OrderedDict()
@@ -23,8 +37,9 @@ def load_checkpoint(step, model_path):
            new_state_dict[param] = model_dict[param]
    return new_state_dict, opti_dict
 def main(args):
    local_rank = dg.parallel.Env().local_rank if args.use_data_parallel else 0
    nranks = dg.parallel.Env().nranks if args.use_data_parallel else 1
@@ -35,23 +50,26 @@ def main(args):
    place = (fluid.CUDAPlace(dg.parallel.Env().dev_id)
             if args.use_data_parallel else fluid.CUDAPlace(0)
             if args.use_gpu else fluid.CPUPlace())
    if not os.path.exists(args.log_dir):
-            os.mkdir(args.log_dir)
+        os.mkdir(args.log_dir)
-    path = os.path.join(args.log_dir,'vocoder')
+    path = os.path.join(args.log_dir, 'vocoder')
    writer = SummaryWriter(path) if local_rank == 0 else None
-    with dg.guard(place):   
+    with dg.guard(place):
        model = Vocoder(cfg, args.batch_size)
        model.train()
-        optimizer = fluid.optimizer.AdamOptimizer(learning_rate=dg.NoamDecay(1/(cfg['warm_up_step'] *( args.lr ** 2)), cfg['warm_up_step']),
+        optimizer = fluid.optimizer.AdamOptimizer(
-                                                  parameter_list=model.parameters())
+            learning_rate=dg.NoamDecay(1 / (
+                cfg['warm_up_step'] * (args.lr**2)), cfg['warm_up_step']),
+            parameter_list=model.parameters())
        if args.checkpoint_path is not None:
-            model_dict, opti_dict = load_checkpoint(str(args.vocoder_step), os.path.join(args.checkpoint_path, "vocoder"))
+            model_dict, opti_dict = load_checkpoint(
+                str(args.vocoder_step),
+                os.path.join(args.checkpoint_path, "vocoder"))
            model.set_dict(model_dict)
            optimizer.set_dict(opti_dict)
            global_step = args.vocoder_step
@@ -61,48 +79,55 @@ def main(args):
            strategy = dg.parallel.prepare_context()
            model = fluid.dygraph.parallel.DataParallel(model, strategy)
-        reader = LJSpeechLoader(cfg, args, nranks, local_rank, is_vocoder=True).reader()
+        reader = LJSpeechLoader(
+            cfg, args, nranks, local_rank, is_vocoder=True).reader()
        for epoch in range(args.epochs):
            pbar = tqdm(reader)
            for i, data in enumerate(pbar):
-                pbar.set_description('Processing at epoch %d'%epoch)
+                pbar.set_description('Processing at epoch %d' % epoch)
                mel, mag = data
                mag = dg.to_variable(mag.numpy())
                mel = dg.to_variable(mel.numpy())
                global_step += 1
                mag_pred = model(mel)
-                loss = layers.mean(layers.abs(layers.elementwise_sub(mag_pred, mag)))
+                loss = layers.mean(
+                    layers.abs(layers.elementwise_sub(mag_pred, mag)))
                if args.use_data_parallel:
                    loss = model.scale_loss(loss)
                    loss.backward()
                    model.apply_collective_grads()
                else:
                    loss.backward()
-                optimizer.minimize(loss, grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg['grad_clip_thresh']))
+                optimizer.minimize(
+                    loss,
+                    grad_clip=fluid.dygraph_grad_clip.GradClipByGlobalNorm(cfg[
+                        'grad_clip_thresh']))
                model.clear_gradients()
-                if local_rank==0:
+                if local_rank == 0:
-                    writer.add_scalars('training_loss',{
+                    writer.add_scalars('training_loss', {
-                        'loss':loss.numpy(),
+                        'loss': loss.numpy(),
                    }, global_step)
                    if global_step % args.save_step == 0:
                        if not os.path.exists(args.save_path):
                            os.mkdir(args.save_path)
-                        save_path = os.path.join(args.save_path,'vocoder/%d' % global_step)
+                        save_path = os.path.join(args.save_path,
+                                                 'vocoder/%d' % global_step)
                        dg.save_dygraph(model.state_dict(), save_path)
                        dg.save_dygraph(optimizer.state_dict(), save_path)
-        if local_rank==0:
+        if local_rank == 0:
            writer.close()
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Train vocoder model")
    add_config_options_to_parser(parser)
    args = parser.parse_args()
    # Print the whole config setting.
    pprint(args)
    main(args)
\ No newline at end of file
--- a/examples/waveflow/benchmark.py
+++ b/examples/waveflow/benchmark.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import random
 from pprint import pprint

--- a/examples/waveflow/synthesis.py
+++ b/examples/waveflow/synthesis.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import random
 from pprint import pprint

--- a/examples/waveflow/train.py
+++ b/examples/waveflow/train.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import random
 import subprocess

--- a/examples/waveflow/utils.py
+++ b/examples/waveflow/utils.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import itertools
 import os
 import time

--- a/parakeet/__init__.py
+++ b/parakeet/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 __version__ = "0.0.0"
 from . import data, g2p, models, modules
--- a/parakeet/audio/__init__.py
+++ b/parakeet/audio/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .audio import AudioProcessor
\ No newline at end of file
--- a/parakeet/audio/audio.py
+++ b/parakeet/audio/audio.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import librosa
 import soundfile as sf
 import numpy as np
 import scipy.io
 import scipy.signal
 class AudioProcessor(object):
-    def __init__(self,
+    def __init__(
-                 sample_rate=None, # int, sampling rate
+            self,
-                 num_mels=None, # int, bands of mel spectrogram
+            sample_rate=None,  # int, sampling rate
-                 min_level_db=None, # float, minimum level db
+            num_mels=None,  # int, bands of mel spectrogram
-                 ref_level_db=None, # float, reference level db
+            min_level_db=None,  # float, minimum level db
-                 n_fft=None, # int: number of samples in a frame for stft
+            ref_level_db=None,  # float, reference level db
-                 win_length=None, # int: the same meaning with n_fft
+            n_fft=None,  # int: number of samples in a frame for stft
-                 hop_length=None, # int: number of samples between neighboring frame
+            win_length=None,  # int: the same meaning with n_fft
-                 power=None, # float:power to raise before griffin-lim
+            hop_length=None,  # int: number of samples between neighboring frame
-                 preemphasis=None, # float: preemphasis coefficident
+            power=None,  # float:power to raise before griffin-lim
-                 signal_norm=None, # 
+            preemphasis=None,  # float: preemphasis coefficient
-                 symmetric_norm=False, # bool, apply clip norm in [-max_norm, max_form]
+            signal_norm=None,  # 
-                 max_norm=None, # float, max norm
+            symmetric_norm=False,  # bool, apply clip norm in [-max_norm, max_norm]
-                 mel_fmin=None, # int: mel spectrogram's minimum frequency
+            max_norm=None,  # float, max norm
-                 mel_fmax=None, # int: mel spectrogram's maximum frequency
+            mel_fmin=None,  # int: mel spectrogram's minimum frequency
-                 clip_norm=True, # bool: clip spectrogram's norm
+            mel_fmax=None,  # int: mel spectrogram's maximum frequency
-                 griffin_lim_iters=None, # int:
+            clip_norm=True,  # bool: clip spectrogram's norm
-                 do_trim_silence=False, # bool: trim silence
+            griffin_lim_iters=None,  # int:
-                 sound_norm=False,
+            do_trim_silence=False,  # bool: trim silence
-                 **kwargs):
+            sound_norm=False,
+            **kwargs):
        self.sample_rate = sample_rate
        self.num_mels = num_mels
        self.min_level_db = min_level_db
@@ -34,8 +50,8 @@ class AudioProcessor(object):
        self.n_fft = n_fft
        self.win_length = win_length or n_fft
        # hop length defaults to 1/4 window_length
-        self.hop_length = hop_length or 0.25 * self.win_length 
+        self.hop_length = hop_length or 0.25 * self.win_length
        self.power = power
        self.preemphasis = float(preemphasis)
@@ -52,7 +68,8 @@ class AudioProcessor(object):
        self.do_trim_silence = do_trim_silence
        self.sound_norm = sound_norm
-        self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters()
+        self.num_freq, self.frame_length_ms, self.frame_shift_ms = self._stft_parameters(
+        )
    def _stft_parameters(self):
        """compute frame length and hop length in ms"""
@@ -65,44 +82,54 @@ class AudioProcessor(object):
        """object repr"""
        cls_name_str = self.__class__.__name__
        members = vars(self)
-        dict_str = "\n".join(["  {}: {},".format(k, v) for k, v in members.items()])
+        dict_str = "\n".join(
+            ["  {}: {},".format(k, v) for k, v in members.items()])
        repr_str = "{}(\n{})\n".format(cls_name_str, dict_str)
        return repr_str
    def save_wav(self, path, wav):
        """save audio with scipy.io.wavfile in 16bit integers"""
        wav_norm = wav * (32767 / max(0.01, np.max(np.abs(wav))))
-        scipy.io.wavfile.write(path, self.sample_rate, wav_norm.as_type(np.int16))
+        scipy.io.wavfile.write(path, self.sample_rate,
+                               wav_norm.astype(np.int16))  # ndarray method is astype; as_type raises AttributeError
    def load_wav(self, path, sr=None):
        """load wav -> trim_silence -> rescale"""
        x, sr = librosa.load(path, sr=None)
-        assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(sr, self.sample_rate)
+        assert self.sample_rate == sr, "audio sample rate: {}Hz != processor sample rate: {}Hz".format(
+            sr, self.sample_rate)
        if self.do_trim_silence:
            try:
                x = self.trim_silence(x)
            except ValueError:
-                print(" [!] File cannot be trimmed for silence - {}".format(path))
+                print(" [!] File cannot be trimmed for silence - {}".format(
+                    path))
        if self.sound_norm:
-            x = x / x.max() * 0.9 # why 0.9 ?
+            x = x / x.max() * 0.9  # why 0.9 ?
        return x
    def trim_silence(self, wav):
        """Trim soilent parts with a threshold and 0.01s margin"""
        margin = int(self.sample_rate * 0.01)
-        wav = wav[margin: -margin]
+        wav = wav[margin:-margin]
-        trimed_wav = librosa.effects.trim(wav, top_db=60, frame_length=self.win_length, hop_length=self.hop_length)[0]
+        trimed_wav = librosa.effects.trim(
+            wav,
+            top_db=60,
+            frame_length=self.win_length,
+            hop_length=self.hop_length)[0]
        return trimed_wav
    def apply_preemphasis(self, x):
        if self.preemphasis == 0.:
-            raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+            raise RuntimeError(
+                " !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1., -self.preemphasis], [1.], x)
    def apply_inv_preemphasis(self, x):
        if self.preemphasis == 0.:
-            raise RuntimeError(" !! Preemphasis coefficient should be positive. ")
+            raise RuntimeError(
+                " !! Preemphasis coefficient should be positive. ")
        return scipy.signal.lfilter([1.], [1., -self.preemphasis], x)
    def _amplitude_to_db(self, x):
@@ -125,12 +152,11 @@ class AudioProcessor(object):
        """return mel basis for mel scale"""
        if self.mel_fmax is not None:
            assert self.mel_fmax <= self.sample_rate // 2
-        return librosa.filters.mel(
+        return librosa.filters.mel(self.sample_rate,
-            self.sample_rate, 
+                                   self.n_fft,
-            self.n_fft, 
+                                   n_mels=self.num_mels,
-            n_mels=self.num_mels,
+                                   fmin=self.mel_fmin,
-            fmin=self.mel_fmin,
+                                   fmax=self.mel_fmax)
-            fmax=self.mel_fmax)
    def _normalize(self, S):
        """put values in [0, self.max_norm] or [-self.max_norm, self,max_norm]"""
@@ -156,25 +182,29 @@ class AudioProcessor(object):
            if self.symmetric_norm:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, -self.max_norm, self.max_norm)
-                S_denorm = (S_denorm + self.max_norm) * (-self.min_level_db) / (2 * self.max_norm) + self.min_level_db
+                S_denorm = (S_denorm + self.max_norm) * (
+                    -self.min_level_db) / (2 * self.max_norm
+                                           ) + self.min_level_db
                return S_denorm
            else:
                if self.clip_norm:
                    S_denorm = np.clip(S_denorm, 0, self.max_norm)
-                S_denorm = S_denorm * (-self.min_level_db)/ self.max_norm + self.min_level_db
+                S_denorm = S_denorm * (-self.min_level_db
+                                       ) / self.max_norm + self.min_level_db
                return S_denorm
        else:
            return S
    def _stft(self, y):
        return librosa.stft(
-            y=y, 
+            y=y,
            n_fft=self.n_fft,
            win_length=self.win_length,
            hop_length=self.hop_length)
    def _istft(self, S):
-        return librosa.istft(S, hop_length=self.hop_length, win_length=self.win_length)
+        return librosa.istft(
+            S, hop_length=self.hop_length, win_length=self.win_length)
    def spectrogram(self, y):
        """compute linear spectrogram(amplitude)
@@ -195,7 +225,8 @@ class AudioProcessor(object):
            D = self._stft(self.apply_preemphasis(y))
        else:
            D = self._stft(y)
-        S = self._amplitude_to_db(self._linear_to_mel(np.abs(D))) - self.ref_level_db
+        S = self._amplitude_to_db(self._linear_to_mel(np.abs(
+            D))) - self.ref_level_db
        return self._normalize(S)
    def inv_spectrogram(self, spectrogram):
@@ -203,16 +234,16 @@ class AudioProcessor(object):
        S = self._denormalize(spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        if self.preemphasis:
-            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
+            return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
-        return self._griffin_lim(S ** self.power)
+        return self._griffin_lim(S**self.power)
    def inv_melspectrogram(self, mel_spectrogram):
        S = self._denormalize(mel_spectrogram)
        S = self._db_to_amplitude(S + self.ref_level_db)
        S = self._mel_to_linear(np.abs(S))
        if self.preemphasis:
-            return self.apply_inv_preemphasis(self._griffin_lim(S ** self.power))
+            return self.apply_inv_preemphasis(self._griffin_lim(S**self.power))
-        return self._griffin_lim(S ** self.power)
+        return self._griffin_lim(S**self.power)
    def out_linear_to_mel(self, linear_spec):
        """convert output linear spec to mel spec"""
@@ -222,7 +253,7 @@ class AudioProcessor(object):
        S = self._amplitude_to_db(S) - self.ref_level_db
        mel = self._normalize(S)
        return mel
    def _griffin_lim(self, S):
        angles = np.exp(2j * np.pi * np.random.rand(*S.shape))
        S_complex = np.abs(S).astype(np.complex)
@@ -234,18 +265,18 @@ class AudioProcessor(object):
    @staticmethod
    def mulaw_encode(wav, qc):
-        mu = 2 ** qc - 1
+        mu = 2**qc - 1
        # wav_abs = np.minimum(np.abs(wav), 1.0)
        signal = np.sign(wav) * np.log(1 + mu * np.abs(wav)) / np.log(1. + mu)
        # Quantize signal to the specified number of levels.
        signal = (signal + 1) / 2 * mu + 0.5
-        return np.floor(signal,)
+        return np.floor(signal, )
    @staticmethod
    def mulaw_decode(wav, qc):
        """Recovers waveform from quantized values."""
-        mu = 2 ** qc - 1
+        mu = 2**qc - 1
-        x = np.sign(wav) / mu * ((1 + mu) ** np.abs(wav) - 1)
+        x = np.sign(wav) / mu * ((1 + mu)**np.abs(wav) - 1)
        return x
    @staticmethod

--- a/parakeet/data/__init__.py
+++ b/parakeet/data/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from .dataset import *
 from .datacargo import *
 from .sampler import *

--- a/parakeet/data/batch.py
+++ b/parakeet/data/batch.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 functions to make batch for arrays which satisfy some conditions.
 """
 import numpy as np
 class TextIDBatcher(object):
    """A wrapper class for a function to build a functor, which holds the configs to pass to the function."""
    def __init__(self, pad_id=0, dtype=np.int64):
        self.pad_id = pad_id
        self.dtype = dtype
    def __call__(self, minibatch):
        out = batch_text_id(minibatch, pad_id=self.pad_id, dtype=self.dtype)
        return out
 def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
    """
    minibatch: List[Example]
@@ -20,26 +36,32 @@ def batch_text_id(minibatch, pad_id=0, dtype=np.int64):
    """
    peek_example = minibatch[0]
    assert len(peek_example.shape) == 1, "text example is an 1D tensor"
-    lengths = [example.shape[0] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
+    lengths = [example.shape[0] for example in minibatch
+               ]  # assume (channel, n_samples) or (n_samples, )
    max_len = np.max(lengths)
    batch = []
    for example in minibatch:
        pad_len = max_len - example.shape[0]
-        batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_id))
+        batch.append(
+            np.pad(example, [(0, pad_len)],
+                   mode='constant',
+                   constant_values=pad_id))
    return np.array(batch, dtype=dtype)
 class WavBatcher(object):
    def __init__(self, pad_value=0., dtype=np.float32):
        self.pad_value = pad_value
        self.dtype = dtype
    def __call__(self, minibatch):
        out = batch_wav(minibatch, pad_value=self.pad_value, dtype=self.dtype)
        return out
 def batch_wav(minibatch, pad_value=0., dtype=np.float32):
    """
    minibatch: List[Example]
@@ -51,18 +73,25 @@ def batch_wav(minibatch, pad_value=0., dtype=np.float32):
        mono_channel = True
    elif len(peek_example.shape) == 2:
        mono_channel = False
-    lengths = [example.shape[-1] for example in minibatch] # assume (channel, n_samples) or (n_samples, )
+    lengths = [example.shape[-1] for example in minibatch
+               ]  # assume (channel, n_samples) or (n_samples, )
    max_len = np.max(lengths)
    batch = []
    for example in minibatch:
        pad_len = max_len - example.shape[-1]
        if mono_channel:
-            batch.append(np.pad(example, [(0, pad_len)], mode='constant', constant_values=pad_value))
+            batch.append(
+                np.pad(example, [(0, pad_len)],
+                       mode='constant',
+                       constant_values=pad_value))
        else:
-            batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
+            batch.append(
+                np.pad(example, [(0, 0), (0, pad_len)],
+                       mode='constant',
+                       constant_values=pad_value))  # what about PCM, no
    return np.array(batch, dtype=dtype)
@@ -75,6 +104,7 @@ class SpecBatcher(object):
        out = batch_spec(minibatch, pad_value=self.pad_value, dtype=self.dtype)
        return out
 def batch_spec(minibatch, pad_value=0., dtype=np.float32):
    """
    minibatch: List[Example]
@@ -86,16 +116,23 @@ def batch_spec(minibatch, pad_value=0., dtype=np.float32):
        mono_channel = True
    elif len(peek_example.shape) == 3:
        mono_channel = False
-    lengths = [example.shape[-1] for example in minibatch] # assume (channel, F, n_frame) or (F, n_frame)
+    lengths = [example.shape[-1] for example in minibatch
-    max_len = np.max(lengths)  
+               ]  # assume (channel, F, n_frame) or (F, n_frame)
+    max_len = np.max(lengths)
    batch = []
    for example in minibatch:
        pad_len = max_len - example.shape[-1]
        if mono_channel:
-            batch.append(np.pad(example, [(0, 0), (0, pad_len)], mode='constant', constant_values=pad_value))
+            batch.append(
+                np.pad(example, [(0, 0), (0, pad_len)],
+                       mode='constant',
+                       constant_values=pad_value))
        else:
-            batch.append(np.pad(example, [(0, 0), (0, 0), (0, pad_len)], mode='constant', constant_values=pad_value)) # what about PCM, no
+            batch.append(
+                np.pad(example, [(0, 0), (0, 0), (0, pad_len)],
-    return np.array(batch, dtype=dtype) 
+                       mode='constant',
\ No newline at end of file
+                       constant_values=pad_value))  # what about PCM, no
+    return np.array(batch, dtype=dtype)
--- a/parakeet/data/datacargo.py
+++ b/parakeet/data/datacargo.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import six
 from .sampler import SequentialSampler, RandomSampler, BatchSampler

--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import six
 import numpy as np
@@ -9,8 +23,7 @@ class DatasetMixin(object):
        if isinstance(index, slice):
            start, stop, step = index.indices(len(self))
            return [
-                self.get_example(i)
+                self.get_example(i) for i in six.moves.range(start, stop, step)
-                for i in six.moves.range(start, stop, step)
            ]
        elif isinstance(index, (list, np.ndarray)):
            return [self.get_example(i) for i in index]
@@ -180,8 +193,7 @@ class ChainDataset(DatasetMixin):
    def get_example(self, i):
        if i < 0:
-            raise IndexError(
+            raise IndexError("ChainDataset doesnot support negative indexing.")
-                "ChainDataset doesnot support negative indexing.")
        for dataset in self._datasets:
            if i < len(dataset):

--- a/parakeet/data/sampler.py
+++ b/parakeet/data/sampler.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 At most cases, we have non-stream dataset, which means we can random access it with __getitem__, and we can get the length of the dataset with __len__.
@@ -6,10 +19,10 @@ This suffices for a sampler. We implemente sampler as iterable of valid indices.
 So the sampler is only responsible for generating valid indices.
 """
 import numpy as np
 import random
 class Sampler(object):
    def __init__(self, data_source):
        pass
@@ -23,7 +36,7 @@ class Sampler(object):
 class SequentialSampler(Sampler):
    def __init__(self, data_source):
        self.data_source = data_source
    def __iter__(self):
        return iter(range(len(self.data_source)))
@@ -42,12 +55,14 @@ class RandomSampler(Sampler):
                             "replacement={}".format(self.replacement))
        if self._num_samples is not None and not replacement:
-            raise ValueError("With replacement=False, num_samples should not be specified, "
+            raise ValueError(
-                             "since a random permutation will be performed.")
+                "With replacement=False, num_samples should not be specified, "
+                "since a random permutation will be performed.")
        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError("num_samples should be a positive integer "
-                             "value, but got num_samples={}".format(self.num_samples))
+                             "value, but got num_samples={}".format(
+                                 self.num_samples))
    @property
    def num_samples(self):
@@ -59,7 +74,9 @@ class RandomSampler(Sampler):
    def __iter__(self):
        n = len(self.data_source)
        if self.replacement:
-            return iter(np.random.randint(0, n, size=(self.num_samples,), dtype=np.int64).tolist())
+            return iter(
+                np.random.randint(
+                    0, n, size=(self.num_samples, ), dtype=np.int64).tolist())
        return iter(np.random.permutation(n).tolist())
    def __len__(self):
@@ -76,7 +93,8 @@ class SubsetRandomSampler(Sampler):
        self.indices = indices
    def __iter__(self):
-        return (self.indices[i] for i in np.random.permutation(len(self.indices)))
+        return (self.indices[i]
+                for i in np.random.permutation(len(self.indices)))
    def __len__(self):
        return len(self.indices)
@@ -89,9 +107,14 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
    3. Permutate mini-batches
    """
-    def __init__(self, lengths, batch_size=4, batch_group_size=None,
+    def __init__(self,
+                 lengths,
+                 batch_size=4,
+                 batch_group_size=None,
                 permutate=True):
-        _lengths = np.array(lengths, dtype=np.int64) # maybe better implement length as a sort key
+        _lengths = np.array(
+            lengths,
+            dtype=np.int64)  # maybe better implement length as a sort key
        self.lengths = np.sort(_lengths)
        self.sorted_indices = np.argsort(_lengths)
@@ -112,20 +135,21 @@ class PartialyRandomizedSimilarTimeLengthSampler(Sampler):
        for i in range(len(indices) // batch_group_size):
            s = i * batch_group_size
            e = s + batch_group_size
-            random.shuffle(indices[s: e]) # inplace
+            random.shuffle(indices[s:e])  # inplace
        # Permutate batches
        if self.permutate:
            perm = np.arange(len(indices[:e]) // self.batch_size)
            random.shuffle(perm)
-            indices[:e] = indices[:e].reshape(-1, self.batch_size)[perm, :].reshape(-1)
+            indices[:e] = indices[:e].reshape(
+                -1, self.batch_size)[perm, :].reshape(-1)
        # Handle last elements
        s += batch_group_size
        #print(indices)
        if s < len(indices):
            random.shuffle(indices[s:])
        return iter(indices)
    def __len__(self):
@@ -150,14 +174,19 @@ class WeightedRandomSampler(Sampler):
    def __init__(self, weights, num_samples, replacement):
        if not isinstance(num_samples, int) or num_samples <= 0:
            raise ValueError("num_samples should be a positive integer "
-                             "value, but got num_samples={}".format(num_samples))
+                             "value, but got num_samples={}".format(
+                                 num_samples))
        self.weights = np.array(weights, dtype=np.float64)
        self.num_samples = num_samples
        self.replacement = replacement
    def __iter__(self):
-        return iter(np.random.choice(len(self.weights), size=(self.num_samples, ),  
+        return iter(
-                                     replace=self.replacement, p=self.weights).tolist())
+            np.random.choice(
+                len(self.weights),
+                size=(self.num_samples, ),
+                replace=self.replacement,
+                p=self.weights).tolist())
    def __len__(self):
        return self.num_samples
@@ -184,7 +213,7 @@ class DistributedSampler(Sampler):
        # Subset samples for each trainer.
        indices = indices[self.rank:self.total_size:self.num_trainers]
-        assert len(indices) ==  self.num_samples
+        assert len(indices) == self.num_samples
        return iter(indices)
@@ -209,8 +238,7 @@ class BatchSampler(Sampler):
    def __init__(self, sampler, batch_size, drop_last):
        if not isinstance(sampler, Sampler):
            raise ValueError("sampler should be an instance of "
-                             "Sampler, but got sampler={}"
+                             "Sampler, but got sampler={}".format(sampler))
-                             .format(sampler))
        if not isinstance(batch_size, int) or batch_size <= 0:
            raise ValueError("batch_size should be a positive integer value, "
                             "but got batch_size={}".format(batch_size))

--- a/parakeet/datasets/README.md
+++ b/parakeet/datasets/README.md
@@ -14,9 +14,4 @@ One of the reasons we choose to load data lazily (only load metadata before hand
 For deep learning practice, we typically batch examples, so the dataset should come with a method to batch examples. Assume a record is implemented as a tuple with several items. When an item is represented as a fixed-size array, batching is trivial: `np.stack` suffices. But for arrays with dynamic size, padding is needed. We decided to implement a batching method for each item, so that batching a record can be implemented with these methods. For a dataset, a `_batch_examples` method should be implemented, but in most cases you can choose one from `batching.py`.
-That is it! 
+That is it!
--- a/parakeet/datasets/__init__.py
+++ b/parakeet/datasets/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/parakeet/datasets/ljspeech.py
+++ b/parakeet/datasets/ljspeech.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from pathlib import Path
 import numpy as np
 import pandas as pd

--- a/parakeet/datasets/vctk.py
+++ b/parakeet/datasets/vctk.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from pathlib import Path
 import pandas as pd
 from ruamel.yaml import YAML
@@ -11,23 +25,25 @@ from parakeet.data.dataset import Dataset
 from parakeet.data.datacargo import DataCargo
 from parakeet.data.batch import TextIDBatcher, WavBatcher
 class VCTK(Dataset):
    def __init__(self, root):
-        assert isinstance(root, (str, Path)), "root should be a string or Path object"
+        assert isinstance(root, (
+            str, Path)), "root should be a string or Path object"
        self.root = root if isinstance(root, Path) else Path(root)
        self.text_root = self.root.joinpath("txt")
        self.wav_root = self.root.joinpath("wav48")
-        if not (self.root.joinpath("metadata.csv").exists() and 
+        if not (self.root.joinpath("metadata.csv").exists() and
                self.root.joinpath("speaker_indices.yaml").exists()):
            self._prepare_metadata()
        self.speaker_indices, self.metadata = self._load_metadata()
    def _load_metadata(self):
-        yaml=YAML(typ='safe')
+        yaml = YAML(typ='safe')
        speaker_indices = yaml.load(self.root.joinpath("speaker_indices.yaml"))
-        metadata = pd.read_csv(self.root.joinpath("metadata.csv"), 
+        metadata = pd.read_csv(
-                               sep="|", quoting=3, header=1)
+            self.root.joinpath("metadata.csv"), sep="|", quoting=3, header=1)
        return speaker_indices, metadata
    def _prepare_metadata(self):
@@ -41,15 +57,19 @@ class VCTK(Dataset):
                        with io.open(str(text_file)) as f:
                            transcription = f.read().strip()
                    wav_file = text_file.with_suffix(".wav")
-                    metadata.append((wav_file.name, speaker_folder.name, transcription))
+                    metadata.append(
-        metadata = pd.DataFrame.from_records(metadata,
+                        (wav_file.name, speaker_folder.name, transcription))
-                                             columns=["wave_file", "speaker", "text"])
+        metadata = pd.DataFrame.from_records(
+            metadata, columns=["wave_file", "speaker", "text"])
        # save them
-        yaml=YAML(typ='safe')
+        yaml = YAML(typ='safe')
        yaml.dump(speaker_to_index, self.root.joinpath("speaker_indices.yaml"))
-        metadata.to_csv(self.root.joinpath("metadata.csv"), 
+        metadata.to_csv(
-                        sep="|", quoting=3, index=False)
+            self.root.joinpath("metadata.csv"),
+            sep="|",
+            quoting=3,
+            index=False)
    def _get_example(self, metadatum):
        wave_file, speaker, text = metadatum
@@ -77,5 +97,3 @@ class VCTK(Dataset):
        speaker_batch = np.array(speaker_batch)
        phoneme_batch = TextIDBatcher(pad_id=0)(phoneme_batch)
        return wav_batch, speaker_batch, phoneme_batch
\ No newline at end of file
--- a/parakeet/g2p/__init__.py
+++ b/parakeet/g2p/__init__.py
 # coding: utf-8
 """Text processing frontend
 All frontend module should have the following functions:

--- a/parakeet/g2p/en/__init__.py
+++ b/parakeet/g2p/en/__init__.py
@@ -32,6 +32,3 @@ def text_to_sequence(text, p=0.0):
    from ..text import text_to_sequence
    text = text_to_sequence(text, ["english_cleaners"])
    return text
--- a/parakeet/g2p/es/__init__.py
+++ b/parakeet/g2p/es/__init__.py
@@ -12,6 +12,3 @@ def text_to_sequence(text, p=0.0):
    from ..text import text_to_sequence
    text = text_to_sequence(text, ["basic_cleaners"])
    return text
--- a/parakeet/g2p/jp/__init__.py
+++ b/parakeet/g2p/jp/__init__.py
 # coding: utf-8
 import MeCab
 import jaconv
 from random import random
@@ -30,9 +29,9 @@ def _yomi(mecab_result):
 def _mix_pronunciation(tokens, yomis, p):
-    return "".join(
+    return "".join(yomis[idx]
-        yomis[idx] if yomis[idx] is not None and random() < p else tokens[idx]
+                   if yomis[idx] is not None and random() < p else tokens[idx]
-        for idx in range(len(tokens)))
+                   for idx in range(len(tokens)))
 def mix_pronunciation(text, p):
@@ -59,8 +58,7 @@ def normalize_delimitor(text):
 def text_to_sequence(text, p=0.0):
-    for c in [" ", "　", "「", "」", "『", "』", "・", "【", "】",
+    for c in [" ", "　", "「", "」", "『", "』", "・", "【", "】", "（", "）", "(", ")"]:
-              "（", "）", "(", ")"]:
        text = text.replace(c, "")
    text = text.replace("!", "！")
    text = text.replace("?", "？")

--- a/parakeet/g2p/ko/__init__.py
+++ b/parakeet/g2p/ko/__init__.py
 # coding: utf-8
 from random import random
 n_vocab = 0xffff
@@ -13,5 +12,6 @@ _tagger = None
 def text_to_sequence(text, p=0.0):
    return [ord(c) for c in text] + [_eos]  # EOS
 def sequence_to_text(seq):
    return "".join(chr(n) for n in seq)
--- a/parakeet/g2p/text/__init__.py
+++ b/parakeet/g2p/text/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import re
 from . import cleaners
 from .symbols import symbols
 # Mappings from symbol to numeric ID and vice versa:
 _symbol_to_id = {s: i for i, s in enumerate(symbols)}
 _id_to_symbol = {i: s for i, s in enumerate(symbols)}
@@ -32,7 +45,8 @@ def text_to_sequence(text, cleaner_names):
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
-        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
+        sequence += _symbols_to_sequence(
+            _clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

--- a/parakeet/g2p/text/cleaners.py
+++ b/parakeet/g2p/text/cleaners.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 '''
 Cleaners are transformations that run over the input text at both training and eval time.
@@ -14,31 +27,31 @@ import re
 from unidecode import unidecode
 from .numbers import normalize_numbers
 # Regular expression matching whitespace:
 _whitespace_re = re.compile(r'\s+')
 # List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
+_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-    ('mrs', 'misess'),
+                  for x in [
-    ('mr', 'mister'),
+                      ('mrs', 'misess'),
-    ('dr', 'doctor'),
+                      ('mr', 'mister'),
-    ('st', 'saint'),
+                      ('dr', 'doctor'),
-    ('co', 'company'),
+                      ('st', 'saint'),
-    ('jr', 'junior'),
+                      ('co', 'company'),
-    ('maj', 'major'),
+                      ('jr', 'junior'),
-    ('gen', 'general'),
+                      ('maj', 'major'),
-    ('drs', 'doctors'),
+                      ('gen', 'general'),
-    ('rev', 'reverend'),
+                      ('drs', 'doctors'),
-    ('lt', 'lieutenant'),
+                      ('rev', 'reverend'),
-    ('hon', 'honorable'),
+                      ('lt', 'lieutenant'),
-    ('sgt', 'sergeant'),
+                      ('hon', 'honorable'),
-    ('capt', 'captain'),
+                      ('sgt', 'sergeant'),
-    ('esq', 'esquire'),
+                      ('capt', 'captain'),
-    ('ltd', 'limited'),
+                      ('esq', 'esquire'),
-    ('col', 'colonel'),
+                      ('ltd', 'limited'),
-    ('ft', 'fort'),
+                      ('col', 'colonel'),
-]]
+                      ('ft', 'fort'),
+                  ]]
 def expand_abbreviations(text):

--- a/parakeet/g2p/text/cmudict.py
+++ b/parakeet/g2p/text/cmudict.py
-import re
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
 valid_symbols = [
-    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1', 'AH2',
+    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-    'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0', 'AY1', 'AY2',
+    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-    'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0', 'ER1', 'ER2', 'EY',
+    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-    'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1', 'IH2', 'IY', 'IY0', 'IY1',
+    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0',
-    'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0',
+    'IH1', 'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG',
-    'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW',
+    'OW', 'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH',
-    'UW0', 'UW1', 'UW2', 'V', 'W', 'Y', 'Z', 'ZH'
+    'T', 'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W',
+    'Y', 'Z', 'ZH'
 ]
 _valid_symbol_set = set(valid_symbols)
@@ -24,7 +38,10 @@ class CMUDict:
        else:
            entries = _parse_cmudict(file_or_path)
        if not keep_ambiguous:
-            entries = {word: pron for word, pron in entries.items() if len(pron) == 1}
+            entries = {
+                word: pron
+                for word, pron in entries.items() if len(pron) == 1
+            }
        self._entries = entries
    def __len__(self):

--- a/parakeet/g2p/text/numbers.py
+++ b/parakeet/g2p/text/numbers.py
@@ -3,7 +3,6 @@
 import inflect
 import re
 _inflect = inflect.engine()
 _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
 _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
@@ -56,7 +55,8 @@ def _expand_number(m):
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
-            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
+            return _inflect.number_to_words(
+                num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')

--- a/parakeet/g2p/text/symbols.py
+++ b/parakeet/g2p/text/symbols.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 '''
 Defines the set of symbols used in text input to the model.

--- a/parakeet/models/__init__.py
+++ b/parakeet/models/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/parakeet/models/deepvoice3/__init__.py
+++ b/parakeet/models/deepvoice3/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from parakeet.models.deepvoice3.encoder import Encoder, ConvSpec
 from parakeet.models.deepvoice3.decoder import Decoder, WindowRange
 from parakeet.models.deepvoice3.converter import Converter

--- a/parakeet/models/deepvoice3/attention.py
+++ b/parakeet/models/deepvoice3/attention.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 from collections import namedtuple
 from paddle import fluid
@@ -19,23 +33,19 @@ class Attention(dg.Layer):
                 value_projection=True):
        super(Attention, self).__init__()
        std = np.sqrt(1 / query_dim)
-        self.query_proj = Linear(query_dim,
+        self.query_proj = Linear(
-                                 embed_dim,
+            query_dim, embed_dim, param_attr=I.Normal(scale=std))
-                                 param_attr=I.Normal(scale=std))
        if key_projection:
            std = np.sqrt(1 / embed_dim)
-            self.key_proj = Linear(embed_dim,
+            self.key_proj = Linear(
-                                   embed_dim,
+                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
-                                   param_attr=I.Normal(scale=std))
        if value_projection:
            std = np.sqrt(1 / embed_dim)
-            self.value_proj = Linear(embed_dim,
+            self.value_proj = Linear(
-                                     embed_dim,
+                embed_dim, embed_dim, param_attr=I.Normal(scale=std))
-                                     param_attr=I.Normal(scale=std))
        std = np.sqrt(1 / embed_dim)
-        self.out_proj = Linear(embed_dim,
+        self.out_proj = Linear(
-                               query_dim,
+            embed_dim, query_dim, param_attr=I.Normal(scale=std))
-                               param_attr=I.Normal(scale=std))
        self.key_projection = key_projection
        self.value_projection = value_projection
@@ -102,9 +112,8 @@ class Attention(dg.Layer):
        x = F.softmax(x)
        attn_scores = x
-        x = F.dropout(x,
+        x = F.dropout(
-                      self.dropout,
+            x, self.dropout, dropout_implementation="upscale_in_train")
-                      dropout_implementation="upscale_in_train")
        x = F.matmul(x, values)
        encoder_length = keys.shape[1]
        # CAUTION: is it wrong? let it be now

--- a/parakeet/models/deepvoice3/conv1dglu.py
+++ b/parakeet/models/deepvoice3/conv1dglu.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 from paddle import fluid
@@ -15,6 +29,7 @@ class Conv1DGLU(dg.Layer):
    has residual connection from the input x, and scale the output by 
    np.sqrt(0.5).
    """
    def __init__(self,
                 n_speakers,
                 speaker_dim,
@@ -50,20 +65,20 @@ class Conv1DGLU(dg.Layer):
            ), "this block uses residual connection"\
                "the input_channes should equals num_filters"
        std = np.sqrt(std_mul * (1 - dropout) / (filter_size * in_channels))
-        self.conv = Conv1DCell(in_channels,
+        self.conv = Conv1DCell(
-                               2 * num_filters,
+            in_channels,
-                               filter_size,
+            2 * num_filters,
-                               dilation,
+            filter_size,
-                               causal,
+            dilation,
-                               param_attr=I.Normal(scale=std))
+            causal,
+            param_attr=I.Normal(scale=std))
        if n_speakers > 1:
            assert (speaker_dim is not None
                    ), "speaker embed should not be null in multi-speaker case"
            std = np.sqrt(1 / speaker_dim)
-            self.fc = Linear(speaker_dim,
+            self.fc = Linear(
-                             num_filters,
+                speaker_dim, num_filters, param_attr=I.Normal(scale=std))
-                             param_attr=I.Normal(scale=std))
    def forward(self, x, speaker_embed=None):
        """
@@ -82,9 +97,8 @@ class Conv1DGLU(dg.Layer):
                C_out means the output channels of Conv1DGLU.
        """
        residual = x
-        x = F.dropout(x,
+        x = F.dropout(
-                      self.dropout,
+            x, self.dropout, dropout_implementation="upscale_in_train")
-                      dropout_implementation="upscale_in_train")
        x = self.conv(x)
        content, gate = F.split(x, num_or_sections=2, dim=1)
@@ -118,9 +132,8 @@ class Conv1DGLU(dg.Layer):
                C_out means the output channels of Conv1DGLU.
        """
        residual = x_t
-        x_t = F.dropout(x_t,
+        x_t = F.dropout(
-                        self.dropout,
+            x_t, self.dropout, dropout_implementation="upscale_in_train")
-                        dropout_implementation="upscale_in_train")
        x_t = self.conv.add_input(x_t)
        content_t, gate_t = F.split(x_t, num_or_sections=2, dim=1)

--- a/parakeet/models/deepvoice3/converter.py
+++ b/parakeet/models/deepvoice3/converter.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 from itertools import chain
@@ -19,44 +33,45 @@ def upsampling_4x_blocks(n_speakers, speaker_dim, target_channels, dropout):
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1 / (2 * target_channels)))),
-        Conv1DGLU(n_speakers,
+        Conv1DGLU(
-                  speaker_dim,
+            n_speakers,
-                  target_channels,
+            speaker_dim,
-                  target_channels,
-                  3,
-                  dilation=1,
-                  std_mul=1.,
-                  dropout=dropout),
-        Conv1DGLU(n_speakers,
-                  speaker_dim,
-                  target_channels,
-                  target_channels,
-                  3,
-                  dilation=3,
-                  std_mul=4.,
-                  dropout=dropout),
-        Conv1DTranspose(
            target_channels,
            target_channels,
-            2,
+            3,
-            stride=2,
+            dilation=1,
-            param_attr=I.Normal(scale=np.sqrt(4. / (2 * target_channels)))),
+            std_mul=1.,
-        Conv1DGLU(n_speakers,
+            dropout=dropout), Conv1DGLU(
-                  speaker_dim,
+                n_speakers,
-                  target_channels,
+                speaker_dim,
-                  target_channels,
+                target_channels,
-                  3,
+                target_channels,
-                  dilation=1,
+                3,
-                  std_mul=1.,
+                dilation=3,
-                  dropout=dropout),
+                std_mul=4.,
-        Conv1DGLU(n_speakers,
+                dropout=dropout), Conv1DTranspose(
-                  speaker_dim,
+                    target_channels,
-                  target_channels,
+                    target_channels,
-                  target_channels,
+                    2,
-                  3,
+                    stride=2,
-                  dilation=3,
+                    param_attr=I.Normal(scale=np.sqrt(
-                  std_mul=4.,
+                        4. / (2 * target_channels)))), Conv1DGLU(
-                  dropout=dropout)
+                            n_speakers,
+                            speaker_dim,
+                            target_channels,
+                            target_channels,
+                            3,
+                            dilation=1,
+                            std_mul=1.,
+                            dropout=dropout), Conv1DGLU(
+                                n_speakers,
+                                speaker_dim,
+                                target_channels,
+                                target_channels,
+                                3,
+                                dilation=3,
+                                std_mul=4.,
+                                dropout=dropout)
    ]
    return upsampling_convolutions
@@ -69,36 +84,38 @@ def upsampling_2x_blocks(n_speakers, speaker_dim, target_channels, dropout):
            2,
            stride=2,
            param_attr=I.Normal(scale=np.sqrt(1. / (2 * target_channels)))),
-        Conv1DGLU(n_speakers,
+        Conv1DGLU(
-                  speaker_dim,
+            n_speakers,
-                  target_channels,
+            speaker_dim,
-                  target_channels,
+            target_channels,
-                  3,
+            target_channels,
-                  dilation=1,
+            3,
-                  std_mul=1.,
+            dilation=1,
-                  dropout=dropout),
+            std_mul=1.,
-        Conv1DGLU(n_speakers,
+            dropout=dropout), Conv1DGLU(
-                  speaker_dim,
+                n_speakers,
-                  target_channels,
+                speaker_dim,
-                  target_channels,
+                target_channels,
-                  3,
+                target_channels,
-                  dilation=3,
+                3,
-                  std_mul=4.,
+                dilation=3,
-                  dropout=dropout)
+                std_mul=4.,
+                dropout=dropout)
    ]
    return upsampling_convolutions
 def upsampling_1x_blocks(n_speakers, speaker_dim, target_channels, dropout):
    upsampling_convolutions = [
-        Conv1DGLU(n_speakers,
+        Conv1DGLU(
-                  speaker_dim,
+            n_speakers,
-                  target_channels,
+            speaker_dim,
-                  target_channels,
+            target_channels,
-                  3,
+            target_channels,
-                  dilation=3,
+            3,
-                  std_mul=4.,
+            dilation=3,
-                  dropout=dropout)
+            std_mul=4.,
+            dropout=dropout)
    ]
    return upsampling_convolutions
@@ -108,6 +125,7 @@ class Converter(dg.Layer):
    Vocoder that transforms mel spectrogram (or ecoder hidden states) 
    to waveform.
    """
    def __init__(self,
                 n_speakers,
                 speaker_dim,
@@ -161,33 +179,36 @@ class Converter(dg.Layer):
                std = np.sqrt(std_mul / in_channels)
                # CAUTION: relu
                self.convolutions.append(
-                    Conv1D(in_channels,
+                    Conv1D(
-                           out_channels,
+                        in_channels,
-                           1,
+                        out_channels,
-                           act="relu",
+                        1,
-                           param_attr=I.Normal(scale=std)))
+                        act="relu",
+                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
-                Conv1DGLU(n_speakers,
+                Conv1DGLU(
-                          speaker_dim,
+                    n_speakers,
-                          in_channels,
+                    speaker_dim,
-                          out_channels,
+                    in_channels,
-                          filter_size,
+                    out_channels,
-                          dilation=dilation,
+                    filter_size,
-                          std_mul=std_mul,
+                    dilation=dilation,
-                          dropout=dropout))
+                    std_mul=std_mul,
+                    dropout=dropout))
            in_channels = out_channels
            std_mul = 4.0
        # final conv proj, channel transformed to linear dim
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        # CAUTION: sigmoid
-        self.last_conv_proj = Conv1D(in_channels,
+        self.last_conv_proj = Conv1D(
-                                     linear_dim,
+            in_channels,
-                                     1,
+            linear_dim,
-                                     act="sigmoid",
+            1,
-                                     param_attr=I.Normal(scale=std))
+            act="sigmoid",
+            param_attr=I.Normal(scale=std))
    def forward(self, x, speaker_embed=None):
        """
@@ -229,4 +250,4 @@ class Converter(dg.Layer):
        out = self.last_conv_proj(x)
        out = F.transpose(out, [0, 2, 1])
        return out
\ No newline at end of file
--- a/parakeet/models/deepvoice3/decoder.py
+++ b/parakeet/models/deepvoice3/decoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 import paddle.fluid.layers as F
 import paddle.fluid.initializer as I
@@ -80,25 +94,25 @@ def unfold_adjacent_frames(folded_frames, r):
 class Decoder(dg.Layer):
    def __init__(
-        self,
+            self,
-        n_speakers,
+            n_speakers,
-        speaker_dim,
+            speaker_dim,
-        embed_dim,
+            embed_dim,
-        mel_dim,
+            mel_dim,
-        r=1,
+            r=1,
-        max_positions=512,
+            max_positions=512,
-        padding_idx=None,  # remove it!
+            padding_idx=None,  # remove it!
-        preattention=(ConvSpec(128, 5, 1), ) * 4,
+            preattention=(ConvSpec(128, 5, 1), ) * 4,
-        convolutions=(ConvSpec(128, 5, 1), ) * 4,
+            convolutions=(ConvSpec(128, 5, 1), ) * 4,
-        attention=True,
+            attention=True,
-        dropout=0.0,
+            dropout=0.0,
-        use_memory_mask=False,
+            use_memory_mask=False,
-        force_monotonic_attention=False,
+            force_monotonic_attention=False,
-        query_position_rate=1.0,
+            query_position_rate=1.0,
-        key_position_rate=1.0,
+            key_position_rate=1.0,
-        window_range=WindowRange(-1, 3),
+            window_range=WindowRange(-1, 3),
-        key_projection=True,
+            key_projection=True,
-        value_projection=True):
+            value_projection=True):
        super(Decoder, self).__init__()
        self.dropout = dropout
@@ -111,23 +125,17 @@ class Decoder(dg.Layer):
        conv_channels = convolutions[0].out_channels
        # only when padding idx is 0 can we easilt handle it
-        self.embed_keys_positions = PositionEmbedding(max_positions,
+        self.embed_keys_positions = PositionEmbedding(
-                                                      embed_dim,
+            max_positions, embed_dim, padding_idx=0)
-                                                      padding_idx=0)
+        self.embed_query_positions = PositionEmbedding(
-        self.embed_query_positions = PositionEmbedding(max_positions,
+            max_positions, conv_channels, padding_idx=0)
-                                                       conv_channels,
-                                                       padding_idx=0)
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
-            self.speaker_proj1 = Linear(speaker_dim,
+            self.speaker_proj1 = Linear(
-                                        1,
+                speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
-                                        act="sigmoid",
+            self.speaker_proj2 = Linear(
-                                        param_attr=I.Normal(scale=std))
+                speaker_dim, 1, act="sigmoid", param_attr=I.Normal(scale=std))
-            self.speaker_proj2 = Linear(speaker_dim,
-                                        1,
-                                        act="sigmoid",
-                                        param_attr=I.Normal(scale=std))
        # prenet
        self.prenet = dg.LayerList()
@@ -138,24 +146,26 @@ class Decoder(dg.Layer):
                # conv1d & relu
                std = np.sqrt(std_mul / in_channels)
                self.prenet.append(
-                    Conv1D(in_channels,
+                    Conv1D(
-                           out_channels,
+                        in_channels,
-                           1,
+                        out_channels,
-                           act="relu",
+                        1,
-                           param_attr=I.Normal(scale=std)))
+                        act="relu",
+                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.prenet.append(
-                Conv1DGLU(n_speakers,
+                Conv1DGLU(
-                          speaker_dim,
+                    n_speakers,
-                          in_channels,
+                    speaker_dim,
-                          out_channels,
+                    in_channels,
-                          filter_size,
+                    out_channels,
-                          dilation,
+                    filter_size,
-                          std_mul,
+                    dilation,
-                          dropout,
+                    std_mul,
-                          causal=True,
+                    dropout,
-                          residual=True))
+                    causal=True,
+                    residual=True))
            in_channels = out_channels
            std_mul = 4.0
@@ -184,16 +194,17 @@ class Decoder(dg.Layer):
            assert (
                in_channels == out_channels
            ), "the stack of convolution & attention does not change channels"
-            conv_layer = Conv1DGLU(n_speakers,
+            conv_layer = Conv1DGLU(
-                                   speaker_dim,
+                n_speakers,
-                                   in_channels,
+                speaker_dim,
-                                   out_channels,
+                in_channels,
-                                   filter_size,
+                out_channels,
-                                   dilation,
+                filter_size,
-                                   std_mul,
+                dilation,
-                                   dropout,
+                std_mul,
-                                   causal=True,
+                dropout,
-                                   residual=False)
+                causal=True,
+                residual=False)
            attn_layer = Attention(
                out_channels,
                embed_dim,
@@ -211,10 +222,8 @@ class Decoder(dg.Layer):
        # 1 * 1 conv to transform channels
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
-        self.last_conv = Conv1D(in_channels,
+        self.last_conv = Conv1D(
-                                mel_dim * r,
+            in_channels, mel_dim * r, 1, param_attr=I.Normal(scale=std))
-                                1,
-                                param_attr=I.Normal(scale=std))
        # mel (before sigmoid) to done hat
        std = np.sqrt(1 / in_channels)
@@ -308,9 +317,8 @@ class Decoder(dg.Layer):
        # (B, C, T)
        frames = F.transpose(frames, [0, 2, 1])
        x = frames
-        x = F.dropout(x,
+        x = F.dropout(
-                      self.dropout,
+            x, self.dropout, dropout_implementation="upscale_in_train")
-                      dropout_implementation="upscale_in_train")
        # Prenet
        for layer in self.prenet:
            if isinstance(layer, Conv1DGLU):
@@ -408,14 +416,13 @@ class Decoder(dg.Layer):
            test_inputs = fold_adjacent_frames(test_inputs, self.r)
            test_inputs = F.transpose(test_inputs, [0, 2, 1])
-        initial_input = F.zeros((batch_size, self.mel_dim * self.r, 1),
+        initial_input = F.zeros(
-                                dtype=keys.dtype)
+            (batch_size, self.mel_dim * self.r, 1), dtype=keys.dtype)
        t = 0  # decoder time step
        while True:
-            frame_pos = F.fill_constant((batch_size, 1),
+            frame_pos = F.fill_constant(
-                                        value=t + 1,
+                (batch_size, 1), value=t + 1, dtype="int64")
-                                        dtype="int64")
            w = self.query_position_rate
            if self.n_speakers > 1:
                w = w * F.squeeze(self.speaker_proj2(speaker_embed), [-1])
@@ -433,9 +440,8 @@ class Decoder(dg.Layer):
                    current_input = initial_input
            x_t = current_input
-            x_t = F.dropout(x_t,
+            x_t = F.dropout(
-                            self.dropout,
+                x_t, self.dropout, dropout_implementation="upscale_in_train")
-                            dropout_implementation="upscale_in_train")
            # Prenet
            for layer in self.prenet:
@@ -453,15 +459,15 @@ class Decoder(dg.Layer):
                    x_t = F.transpose(x_t, [0, 2, 1])
                    if frame_pos_embed is not None:
                        x_t += frame_pos_embed
-                    x_t, attn_scores = attn(
+                    x_t, attn_scores = attn(x_t, (keys, values), mask,
-                        x_t, (keys, values), mask,
+                                            last_attended[i]
-                        last_attended[i] if test_inputs is None else None)
+                                            if test_inputs is None else None)
                    x_t = F.transpose(x_t, [0, 2, 1])
                    step_attn_scores.append(attn_scores)  #(B, T_dec=1, T_enc)
                    # update last attended when necessary
                    if self.force_monotonic_attention[i]:
-                        last_attended[i] = np.argmax(attn_scores.numpy(),
+                        last_attended[i] = np.argmax(
-                                                     axis=-1)[0][0]
+                            attn_scores.numpy(), axis=-1)[0][0]
                x_t = F.scale(residual + x_t, np.sqrt(0.5))
            if len(step_attn_scores):
                # (B, 1, T_enc) again
@@ -485,8 +491,8 @@ class Decoder(dg.Layer):
            t += 1
            if test_inputs is None:
-                if F.reduce_min(done_t).numpy(
+                if F.reduce_min(done_t).numpy()[
-                )[0] > 0.5 and t > self.min_decoder_steps:
+                        0] > 0.5 and t > self.min_decoder_steps:
                    break
                elif t > self.max_decoder_steps:
                    break

--- a/parakeet/models/deepvoice3/encoder.py
+++ b/parakeet/models/deepvoice3/encoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 from collections import namedtuple
@@ -33,14 +47,16 @@ class Encoder(dg.Layer):
        self.dropout = dropout
        if n_speakers > 1:
            std = np.sqrt((1 - dropout) / speaker_dim)
-            self.sp_proj1 = Linear(speaker_dim,
+            self.sp_proj1 = Linear(
-                                   embed_dim,
+                speaker_dim,
-                                   act="softsign",
+                embed_dim,
-                                   param_attr=I.Normal(scale=std))
+                act="softsign",
-            self.sp_proj2 = Linear(speaker_dim,
+                param_attr=I.Normal(scale=std))
-                                   embed_dim,
+            self.sp_proj2 = Linear(
-                                   act="softsign",
+                speaker_dim,
-                                   param_attr=I.Normal(scale=std))
+                embed_dim,
+                act="softsign",
+                param_attr=I.Normal(scale=std))
        self.n_speakers = n_speakers
        self.convolutions = dg.LayerList()
@@ -51,31 +67,34 @@ class Encoder(dg.Layer):
            if in_channels != out_channels:
                std = np.sqrt(std_mul / in_channels)
                self.convolutions.append(
-                    Conv1D(in_channels,
+                    Conv1D(
-                           out_channels,
+                        in_channels,
-                           1,
+                        out_channels,
-                           act="relu",
+                        1,
-                           param_attr=I.Normal(scale=std)))
+                        act="relu",
+                        param_attr=I.Normal(scale=std)))
                in_channels = out_channels
                std_mul = 2.0
            self.convolutions.append(
-                Conv1DGLU(n_speakers,
+                Conv1DGLU(
-                          speaker_dim,
+                    n_speakers,
-                          in_channels,
+                    speaker_dim,
-                          out_channels,
+                    in_channels,
-                          filter_size,
+                    out_channels,
-                          dilation,
+                    filter_size,
-                          std_mul,
+                    dilation,
-                          dropout,
+                    std_mul,
-                          causal=False,
+                    dropout,
-                          residual=True))
+                    causal=False,
+                    residual=True))
            in_channels = out_channels
            std_mul = 4.0
        std = np.sqrt(std_mul * (1 - dropout) / in_channels)
        self.convolutions.append(
-            Conv1D(in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
+            Conv1D(
+                in_channels, embed_dim, 1, param_attr=I.Normal(scale=std)))
    def forward(self, x, speaker_embed=None):
        """
@@ -96,9 +115,8 @@ class Encoder(dg.Layer):
                representation for values.
        """
        x = self.embed(x)
-        x = F.dropout(x,
+        x = F.dropout(
-                      self.dropout,
+            x, self.dropout, dropout_implementation="upscale_in_train")
-                      dropout_implementation="upscale_in_train")
        x = F.transpose(x, [0, 2, 1])
        if self.n_speakers > 1 and speaker_embed is not None:

--- a/parakeet/models/deepvoice3/loss.py
+++ b/parakeet/models/deepvoice3/loss.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 from numba import jit
@@ -31,9 +45,7 @@ def guided_attention(N, max_N, T, max_T, g):
    return W
-def guided_attentions(encoder_lengths,
+def guided_attentions(encoder_lengths, decoder_lengths, max_decoder_len,
-                      decoder_lengths,
-                      max_decoder_len,
                      g=0.2):
    B = len(encoder_lengths)
    max_input_len = encoder_lengths.max()
@@ -93,9 +105,8 @@ class TTSLoss(object):
    def binary_divergence(self, prediction, target, mask):
        flattened_prediction = F.reshape(prediction, [-1, 1])
        flattened_target = F.reshape(target, [-1, 1])
-        flattened_loss = F.log_loss(flattened_prediction,
+        flattened_loss = F.log_loss(
-                                    flattened_target,
+            flattened_prediction, flattened_target, epsilon=1e-8)
-                                    epsilon=1e-8)
        bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
        w = self.masked_weight
@@ -163,23 +174,20 @@ class TTSLoss(object):
        max_mel_steps = max_frames // self.downsample_factor
        max_decoder_steps = max_mel_steps // self.r
-        decoder_mask = F.sequence_mask(n_frames // self.downsample_factor //
+        decoder_mask = F.sequence_mask(
-                                       self.r,
+            n_frames // self.downsample_factor // self.r,
-                                       max_decoder_steps,
+            max_decoder_steps,
-                                       dtype="float32")
+            dtype="float32")
-        mel_mask = F.sequence_mask(n_frames // self.downsample_factor,
+        mel_mask = F.sequence_mask(
-                                   max_mel_steps,
+            n_frames // self.downsample_factor, max_mel_steps, dtype="float32")
-                                   dtype="float32")
        lin_mask = F.sequence_mask(n_frames, max_frames, dtype="float32")
        if compute_lin_loss:
            lin_hyp = lin_hyp[:, :-self.time_shift, :]
            lin_ref = lin_ref[:, self.time_shift:, :]
            lin_mask = lin_mask[:, self.time_shift:, :]
-            lin_l1_loss = self.l1_loss(lin_hyp,
+            lin_l1_loss = self.l1_loss(
-                                       lin_ref,
+                lin_hyp, lin_ref, lin_mask, priority_bin=self.priority_bin)
-                                       lin_mask,
-                                       priority_bin=self.priority_bin)
            lin_bce_loss = self.binary_divergence(lin_hyp, lin_ref, lin_mask)
            lin_loss = self.binary_divergence_weight * lin_bce_loss \
                     + (1 - self.binary_divergence_weight) * lin_l1_loss
@@ -197,9 +205,10 @@ class TTSLoss(object):
            total_loss += mel_loss
        if compute_attn_loss:
-            attn_loss = self.attention_loss(
+            attn_loss = self.attention_loss(attn_hyp,
-                attn_hyp, input_lengths.numpy(),
+                                            input_lengths.numpy(),
-                n_frames.numpy() // (self.downsample_factor * self.r))
+                                            n_frames.numpy() //
+                                            (self.downsample_factor * self.r))
            total_loss += attn_loss
        if compute_done_loss:

--- a/parakeet/models/deepvoice3/model.py
+++ b/parakeet/models/deepvoice3/model.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 import paddle.fluid.layers as F
@@ -29,9 +43,9 @@ class DeepVoice3(dg.Layer):
        mel_outputs, alignments, done, decoder_states = self.decoder(
            (keys, values), valid_lengths, mel_inputs, text_positions,
            frame_positions, speaker_embed)
-        linear_outputs = self.converter(
+        linear_outputs = self.converter(decoder_states
-            decoder_states if self.use_decoder_states else mel_outputs,
+                                        if self.use_decoder_states else
-            speaker_embed)
+                                        mel_outputs, speaker_embed)
        return mel_outputs, linear_outputs, alignments, done
    def transduce(self, text_sequences, text_positions, speaker_indices=None):
@@ -43,7 +57,7 @@ class DeepVoice3(dg.Layer):
        keys, values = self.encoder(text_sequences, speaker_embed)
        mel_outputs, alignments, done, decoder_states = self.decoder.decode(
            (keys, values), text_positions, speaker_embed)
-        linear_outputs = self.converter(
+        linear_outputs = self.converter(decoder_states
-            decoder_states if self.use_decoder_states else mel_outputs,
+                                        if self.use_decoder_states else
-            speaker_embed)
+                                        mel_outputs, speaker_embed)
        return mel_outputs, linear_outputs, alignments, done
--- a/parakeet/models/deepvoice3/position_embedding.py
+++ b/parakeet/models/deepvoice3/position_embedding.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 from paddle import fluid
 import paddle.fluid.layers as F
@@ -95,10 +109,11 @@ class PositionEmbedding(dg.Layer):
                                            speaker_position_rate)  # (B, V, C)
        # make indices for gather_nd
        batch_id = F.expand(
-            F.unsqueeze(F.range(0, batch_size, 1, dtype="int64"), [1]),
+            F.unsqueeze(
-            [1, time_steps])
+                F.range(
+                    0, batch_size, 1, dtype="int64"), [1]), [1, time_steps])
        # (B, T, 2)
        gather_nd_id = F.stack([batch_id, indices], -1)
        out = F.gather_nd(weight, gather_nd_id)
        return out
\ No newline at end of file
--- a/parakeet/models/fastspeech/__init__.py
+++ b/parakeet/models/fastspeech/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
--- a/parakeet/models/fastspeech/decoder.py
+++ b/parakeet/models/fastspeech/decoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.models.transformer_tts.utils import *
 from parakeet.models.fastspeech.fft_block import FFTBlock
 class Decoder(dg.Layer):
    def __init__(self,
                 len_max_seq,
@@ -18,16 +32,29 @@ class Decoder(dg.Layer):
        super(Decoder, self).__init__()
        n_position = len_max_seq + 1
-        self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
+        self.pos_inp = get_sinusoid_encoding_table(
-        self.position_enc = dg.Embedding(size=[n_position, d_model],
+            n_position, d_model, padding_idx=0)
-                                 padding_idx=0,
+        self.position_enc = dg.Embedding(
-                                 param_attr=fluid.ParamAttr(
+            size=[n_position, d_model],
-                                     initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
+            padding_idx=0,
-                                     trainable=False))
+            param_attr=fluid.ParamAttr(
-        self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)] 
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    self.pos_inp),
+                trainable=False))
+        self.layer_stack = [
+            FFTBlock(
+                d_model,
+                d_inner,
+                n_head,
+                d_k,
+                d_v,
+                fft_conv1d_kernel,
+                fft_conv1d_padding,
+                dropout=dropout) for _ in range(n_layers)
+        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
    def forward(self, enc_seq, enc_pos):
        """
        Decoder layer of FastSpeech.
@@ -57,4 +84,4 @@ class Decoder(dg.Layer):
                slf_attn_mask=slf_attn_mask)
            dec_slf_attn_list += [dec_slf_attn]
        return dec_output, dec_slf_attn_list
\ No newline at end of file
--- a/parakeet/models/fastspeech/encoder.py
+++ b/parakeet/models/fastspeech/encoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.models.transformer_tts.utils import *
 from parakeet.models.fastspeech.fft_block import FFTBlock
 class Encoder(dg.Layer):
    def __init__(self,
                 n_src_vocab,
@@ -19,14 +33,28 @@ class Encoder(dg.Layer):
        super(Encoder, self).__init__()
        n_position = len_max_seq + 1
-        self.src_word_emb = dg.Embedding(size=[n_src_vocab, d_model], padding_idx=0)
+        self.src_word_emb = dg.Embedding(
-        self.pos_inp = get_sinusoid_encoding_table(n_position, d_model, padding_idx=0)
+            size=[n_src_vocab, d_model], padding_idx=0)
-        self.position_enc = dg.Embedding(size=[n_position, d_model],
+        self.pos_inp = get_sinusoid_encoding_table(
-                                 padding_idx=0,
+            n_position, d_model, padding_idx=0)
-                                 param_attr=fluid.ParamAttr(
+        self.position_enc = dg.Embedding(
-                                     initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
+            size=[n_position, d_model],
-                                     trainable=False))
+            padding_idx=0,
-        self.layer_stack = [FFTBlock(d_model, d_inner, n_head, d_k, d_v, fft_conv1d_kernel, fft_conv1d_padding, dropout=dropout) for _ in range(n_layers)]
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    self.pos_inp),
+                trainable=False))
+        self.layer_stack = [
+            FFTBlock(
+                d_model,
+                d_inner,
+                n_head,
+                d_k,
+                d_v,
+                fft_conv1d_kernel,
+                fft_conv1d_padding,
+                dropout=dropout) for _ in range(n_layers)
+        ]
        for i, layer in enumerate(self.layer_stack):
            self.add_sublayer('fft_{}'.format(i), layer)
@@ -52,7 +80,8 @@ class Encoder(dg.Layer):
        non_pad_mask = get_non_pad_mask(character)
        # -- Forward
-        enc_output = self.src_word_emb(character) + self.position_enc(text_pos) #(N, T, C)
+        enc_output = self.src_word_emb(character) + self.position_enc(
+            text_pos)  #(N, T, C)
        for enc_layer in self.layer_stack:
            enc_output, enc_slf_attn = enc_layer(
@@ -60,5 +89,5 @@ class Encoder(dg.Layer):
                non_pad_mask=non_pad_mask,
                slf_attn_mask=slf_attn_mask)
            enc_slf_attn_list += [enc_slf_attn]
        return enc_output, non_pad_mask, enc_slf_attn_list
\ No newline at end of file
--- a/parakeet/models/fastspeech/fastspeech.py
+++ b/parakeet/models/fastspeech/fastspeech.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
@@ -7,54 +20,67 @@ from parakeet.models.fastspeech.length_regulator import LengthRegulator
 from parakeet.models.fastspeech.encoder import Encoder
 from parakeet.models.fastspeech.decoder import Decoder
 class FastSpeech(dg.Layer):
    def __init__(self, cfg):
        " FastSpeech"
        super(FastSpeech, self).__init__()
        # Builds every sub-layer from the `cfg` mapping: encoder, length
        # regulator, decoder, the mel projection and the post-net.
        # Keys read here: 'max_seq_len', 'fs_hidden_size', 'encoder_n_layer',
        # 'encoder_head', 'encoder_conv1d_filter_size', 'decoder_n_layer',
        # 'decoder_head', 'decoder_conv1d_filter_size', 'fft_conv1d_filter',
        # 'fft_conv1d_padding', 'duration_predictor_output_size',
        # 'duration_predictor_filter_size', 'dropout', and under 'audio':
        # 'num_mels' and 'outputs_per_step'.
        # Per-head sizes d_k/d_v are 'fs_hidden_size' floor-divided by the head
        # count. Encoder, decoder and post-net dropout are fixed at 0.1; only
        # the length regulator takes cfg['dropout'].
        # NOTE(review): the +1 on len(symbols) presumably reserves an index for
        # padding -- confirm against the text front end.
-        self.encoder = Encoder(n_src_vocab=len(symbols)+1,
+        self.encoder = Encoder(
-                               len_max_seq=cfg['max_seq_len'],
+            n_src_vocab=len(symbols) + 1,
-                               n_layers=cfg['encoder_n_layer'],
+            len_max_seq=cfg['max_seq_len'],
-                               n_head=cfg['encoder_head'],
+            n_layers=cfg['encoder_n_layer'],
-                               d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
+            n_head=cfg['encoder_head'],
-                               d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
+            d_k=cfg['fs_hidden_size'] // cfg['encoder_head'],
-                               d_model=cfg['fs_hidden_size'],
+            d_v=cfg['fs_hidden_size'] // cfg['encoder_head'],
-                               d_inner=cfg['encoder_conv1d_filter_size'],
+            d_model=cfg['fs_hidden_size'],
-                               fft_conv1d_kernel=cfg['fft_conv1d_filter'], 
+            d_inner=cfg['encoder_conv1d_filter_size'],
-                               fft_conv1d_padding=cfg['fft_conv1d_padding'],
+            fft_conv1d_kernel=cfg['fft_conv1d_filter'],
-                               dropout=0.1)
+            fft_conv1d_padding=cfg['fft_conv1d_padding'],
-        self.length_regulator = LengthRegulator(input_size=cfg['fs_hidden_size'], 
+            dropout=0.1)
-                                                out_channels=cfg['duration_predictor_output_size'], 
+        self.length_regulator = LengthRegulator(
-                                                filter_size=cfg['duration_predictor_filter_size'], 
+            input_size=cfg['fs_hidden_size'],
-                                                dropout=cfg['dropout'])
+            out_channels=cfg['duration_predictor_output_size'],
-        self.decoder = Decoder(len_max_seq=cfg['max_seq_len'],
+            filter_size=cfg['duration_predictor_filter_size'],
-                                n_layers=cfg['decoder_n_layer'],
+            dropout=cfg['dropout'])
-                                n_head=cfg['decoder_head'],
+        self.decoder = Decoder(
-                                d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
+            len_max_seq=cfg['max_seq_len'],
-                                d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
+            n_layers=cfg['decoder_n_layer'],
-                                d_model=cfg['fs_hidden_size'],
+            n_head=cfg['decoder_head'],
-                                d_inner=cfg['decoder_conv1d_filter_size'],
+            d_k=cfg['fs_hidden_size'] // cfg['decoder_head'],
-                                fft_conv1d_kernel=cfg['fft_conv1d_filter'], 
+            d_v=cfg['fs_hidden_size'] // cfg['decoder_head'],
-                                fft_conv1d_padding=cfg['fft_conv1d_padding'],
+            d_model=cfg['fs_hidden_size'],
-                                dropout=0.1)
+            d_inner=cfg['decoder_conv1d_filter_size'],
-        self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
+            fft_conv1d_kernel=cfg['fft_conv1d_filter'],
+            fft_conv1d_padding=cfg['fft_conv1d_padding'],
+            dropout=0.1)
+        self.weight = fluid.ParamAttr(
+            initializer=fluid.initializer.XavierInitializer())
        # k bounds the uniform bias initializer of mel_linear to [-k, k];
        # mel_linear maps 'fs_hidden_size' features to
        # num_mels * outputs_per_step outputs.
        k = math.sqrt(1 / cfg['fs_hidden_size'])
-        self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
+        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-        self.mel_linear = dg.Linear(cfg['fs_hidden_size'], 
+            low=-k, high=k))
-                                    cfg['audio']['num_mels']* cfg['audio']['outputs_per_step'],
+        self.mel_linear = dg.Linear(
-                                    param_attr = self.weight,
+            cfg['fs_hidden_size'],
-                                    bias_attr = self.bias,)
+            cfg['audio']['num_mels'] * cfg['audio']['outputs_per_step'],
-        self.postnet = PostConvNet(n_mels=cfg['audio']['num_mels'],
+            param_attr=self.weight,
-                 num_hidden=512,
+            bias_attr=self.bias, )
-                 filter_size=5,
+        self.postnet = PostConvNet(
-                 padding=int(5 / 2),
+            n_mels=cfg['audio']['num_mels'],
-                 num_conv=5,
+            num_hidden=512,
-                 outputs_per_step=cfg['audio']['outputs_per_step'],
+            filter_size=5,
-                 use_cudnn=True,
+            padding=int(5 / 2),
-                 dropout=0.1,
+            num_conv=5,
-                 batchnorm_last=True)
+            outputs_per_step=cfg['audio']['outputs_per_step'],
+            use_cudnn=True,
+            dropout=0.1,
+            batchnorm_last=True)
-    def forward(self, character, text_pos, mel_pos=None, length_target=None, alpha=1.0):
+    def forward(self,
+                character,
+                text_pos,
+                mel_pos=None,
+                length_target=None,
+                alpha=1.0):
        """
        FastSpeech model.
@@ -80,22 +106,25 @@ class FastSpeech(dg.Layer):
            dec_slf_attn_list (Variable), Shape(B, mel_T, mel_T), the decoder self attention list.
        """
-        encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(character, text_pos)
+        encoder_output, non_pad_mask, enc_slf_attn_list = self.encoder(
+            character, text_pos)
        if fluid.framework._dygraph_tracer()._train_mode:
-            length_regulator_output, duration_predictor_output = self.length_regulator(encoder_output,
+            length_regulator_output, duration_predictor_output = self.length_regulator(
-                                                                                       target=length_target,
+                encoder_output, target=length_target, alpha=alpha)
-                                                                                       alpha=alpha)
+            decoder_output, dec_slf_attn_list = self.decoder(
-            decoder_output, dec_slf_attn_list = self.decoder(length_regulator_output, mel_pos)
+                length_regulator_output, mel_pos)
            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output
            return mel_output, mel_output_postnet, duration_predictor_output, enc_slf_attn_list, dec_slf_attn_list
        else:
-            length_regulator_output, decoder_pos = self.length_regulator(encoder_output, alpha=alpha)
+            length_regulator_output, decoder_pos = self.length_regulator(
-            decoder_output, _ = self.decoder(length_regulator_output, decoder_pos)
+                encoder_output, alpha=alpha)
+            decoder_output, _ = self.decoder(length_regulator_output,
+                                             decoder_pos)
            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output
            return mel_output, mel_output_postnet
\ No newline at end of file
--- a/parakeet/models/fastspeech/fft_block.py
+++ b/parakeet/models/fastspeech/fft_block.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 import math
 import paddle.fluid.dygraph as dg
@@ -6,11 +19,32 @@ import paddle.fluid as fluid
 from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
 class FFTBlock(dg.Layer):
-    def __init__(self, d_model, d_inner, n_head, d_k, d_v, filter_size, padding, dropout=0.2):
+    def __init__(self,
+                 d_model,
+                 d_inner,
+                 n_head,
+                 d_k,
+                 d_v,
+                 filter_size,
+                 padding,
+                 dropout=0.2):
        super(FFTBlock, self).__init__()
        # Two sub-layers: multi-head self-attention (slf_attn) and a
        # position-wise feed-forward network (pos_ffn); both receive the same
        # `dropout`. n_head, d_k and d_v go to the attention layer only;
        # d_inner, filter_size and padding go to the feed-forward layer only.
-        self.slf_attn = MultiheadAttention(d_model, d_k, d_v, num_head=n_head, is_bias=True, dropout=dropout, is_concat=False)
+        self.slf_attn = MultiheadAttention(
-        self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, filter_size =filter_size, padding =padding, dropout=dropout)
+            d_model,
+            d_k,
+            d_v,
+            num_head=n_head,
+            is_bias=True,
+            dropout=dropout,
+            is_concat=False)
+        self.pos_ffn = PositionwiseFeedForward(
+            d_model,
+            d_inner,
+            filter_size=filter_size,
+            padding=padding,
+            dropout=dropout)
    def forward(self, enc_input, non_pad_mask=None, slf_attn_mask=None):
        """
@@ -27,10 +61,11 @@ class FFTBlock(dg.Layer):
            output (Variable), Shape(B, T, C), the output after self-attention & ffn.
            slf_attn (Variable), Shape(B * n_head, T, T), the self attention.
        """
-        output, slf_attn = self.slf_attn(enc_input, enc_input, enc_input, mask=slf_attn_mask)
+        output, slf_attn = self.slf_attn(
+            enc_input, enc_input, enc_input, mask=slf_attn_mask)
        output *= non_pad_mask
        output = self.pos_ffn(output)
        output *= non_pad_mask
        return output, slf_attn
\ No newline at end of file
--- a/parakeet/models/fastspeech/length_regulator.py
+++ b/parakeet/models/fastspeech/length_regulator.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 import math
 import parakeet.models.fastspeech.utils
@@ -6,47 +19,50 @@ import paddle.fluid.layers as layers
 import paddle.fluid as fluid
 from parakeet.modules.customized import Conv1D
 class LengthRegulator(dg.Layer):
    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(LengthRegulator, self).__init__()
        # The only sub-layer created here is a DurationPredictor, built with
        # exactly the four constructor arguments received by this method.
-        self.duration_predictor = DurationPredictor(input_size=input_size, 
+        self.duration_predictor = DurationPredictor(
-                                                    out_channels=out_channels, 
+            input_size=input_size,
-                                                    filter_size=filter_size, 
+            out_channels=out_channels,
-                                                    dropout=dropout)
+            filter_size=filter_size,
+            dropout=dropout)
    def LR(self, x, duration_predictor_output, alpha=1.0):
        # Length-regulate a batch: every item is expanded on its own through
        # self.expand, then self.pad brings the expanded items to a common
        # length and stacks them into the returned value.
        # x and duration_predictor_output are sliced as [i:i + 1], so each
        # call to self.expand still receives a leading axis of size 1.
        # alpha is forwarded to self.expand unchanged.
        output = []
        batch_size = x.shape[0]
        for i in range(batch_size):
-            output.append(self.expand(x[i:i+1], duration_predictor_output[i:i+1], alpha))
+            output.append(
+                self.expand(x[i:i + 1], duration_predictor_output[i:i + 1],
+                            alpha))
        output = self.pad(output)
        return output
    def pad(self, input_ele):
        # Pads every element of input_ele with 0.0 up to the largest shape[0]
        # found in the list, then combines the padded elements with
        # layers.stack and returns the result.
        # NOTE(review): [0, pad_len, 0, 0] is read here as (before, after)
        # widths for axis 0 followed by axis 1, i.e. pad_len rows appended at
        # the end of axis 0 -- confirm against fluid.layers.pad.
        max_len = max([input_ele[i].shape[0] for i in range(len(input_ele))])
        out_list = []
        for i in range(len(input_ele)):
            pad_len = max_len - input_ele[i].shape[0]
-            one_batch_padded = layers.pad(
+            one_batch_padded = layers.pad(input_ele[i], [0, pad_len, 0, 0],
-                input_ele[i], [0, pad_len, 0, 0], pad_value=0.0)
+                                          pad_value=0.0)
            out_list.append(one_batch_padded)
        out_padded = layers.stack(out_list)
        return out_padded
    def expand(self, batch, predicted, alpha):
        # Repeats each time step of one batch item by its predicted duration
        # and concatenates the repeated steps along axis 0.
        # batch carries a leading axis that is squeezed away; predicted is
        # converted with .numpy() and indexed as [0, i].
        # Steps whose duration equals 0 are skipped; other durations are
        # converted with int() before being used as the repeat count.
        # alpha is accepted but not used anywhere in this method.
        # NOTE(review): if every duration is 0, `out` is still an empty list
        # when it reaches layers.concat -- confirm how that case should behave.
        out = []
        time_steps = batch.shape[1]
        fertilities = predicted.numpy()
-        batch = layers.squeeze(batch,[0]) 
+        batch = layers.squeeze(batch, [0])
        for i in range(time_steps):
-            if fertilities[0,i]==0:
+            if fertilities[0, i] == 0:
                continue
-            out.append(layers.expand(batch[i: i + 1, :], [int(fertilities[0,i]), 1]))
+            out.append(
+                layers.expand(batch[i:i + 1, :], [int(fertilities[0, i]), 1]))
        out = layers.concat(out, axis=0)
        return out
    def forward(self, x, alpha=1.0, target=None):
        """
@@ -70,10 +86,11 @@ class LengthRegulator(dg.Layer):
        else:
            duration_predictor_output = layers.round(duration_predictor_output)
            output = self.LR(x, duration_predictor_output, alpha)
-            mel_pos = dg.to_variable(np.arange(1, output.shape[1]+1))
+            mel_pos = dg.to_variable(np.arange(1, output.shape[1] + 1))
            mel_pos = layers.unsqueeze(mel_pos, [0])
            return output, mel_pos
 class DurationPredictor(dg.Layer):
    def __init__(self, input_size, out_channels, filter_size, dropout=0.1):
        super(DurationPredictor, self).__init__()
@@ -83,30 +100,38 @@ class DurationPredictor(dg.Layer):
        self.dropout = dropout
        k = math.sqrt(1 / self.input_size)
-        self.conv1 = Conv1D(num_channels = self.input_size, 
+        self.conv1 = Conv1D(
-                        num_filters = self.out_channels, 
+            num_channels=self.input_size,
-                        filter_size = self.filter_size,
+            num_filters=self.out_channels,
-                        padding=1,
+            filter_size=self.filter_size,
-                        param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+            padding=1,
-                        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
+            param_attr=fluid.ParamAttr(
-                        #data_format='NTC')
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
+        #data_format='NTC')
        k = math.sqrt(1 / self.out_channels)
-        self.conv2 = Conv1D(num_channels = self.out_channels, 
+        self.conv2 = Conv1D(
-                        num_filters = self.out_channels, 
+            num_channels=self.out_channels,
-                        filter_size = self.filter_size,
+            num_filters=self.out_channels,
-                        padding=1,
+            filter_size=self.filter_size,
-                        param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+            padding=1,
-                        bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
+            param_attr=fluid.ParamAttr(
-                        #data_format='NTC')
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
+        #data_format='NTC')
        self.layer_norm1 = dg.LayerNorm(self.out_channels)
        self.layer_norm2 = dg.LayerNorm(self.out_channels)
-        self.weight = fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer())
+        self.weight = fluid.ParamAttr(
+            initializer=fluid.initializer.XavierInitializer())
        k = math.sqrt(1 / self.out_channels)
-        self.bias = fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))
+        self.bias = fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+            low=-k, high=k))
-        self.linear =dg.Linear(self.out_channels, 1, param_attr = self.weight,
+        self.linear = dg.Linear(
-                            bias_attr = self.bias)
+            self.out_channels, 1, param_attr=self.weight, bias_attr=self.bias)
    def forward(self, encoder_output):
        """
@@ -118,18 +143,15 @@ class DurationPredictor(dg.Layer):
            out (Variable), Shape(B, T, C), the output of duration predictor.
        """
        # encoder_output.shape(N, T, C)
-        out = layers.transpose(encoder_output, [0,2,1])
+        out = layers.transpose(encoder_output, [0, 2, 1])
        out = self.conv1(out)
-        out = layers.transpose(out, [0,2,1])
+        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm1(out)), self.dropout)
-        out = layers.transpose(out, [0,2,1])
+        out = layers.transpose(out, [0, 2, 1])
        out = self.conv2(out)
-        out = layers.transpose(out, [0,2,1])
+        out = layers.transpose(out, [0, 2, 1])
        out = layers.dropout(layers.relu(self.layer_norm2(out)), self.dropout)
        out = layers.relu(self.linear(out))
        out = layers.squeeze(out, axes=[-1])
-        return out
+        return out
--- a/parakeet/models/fastspeech/utils.py
+++ b/parakeet/models/fastspeech/utils.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy as np
 def get_alignment(attn_probs, mel_lens, n_head):
    max_F = 0
    assert attn_probs[0].shape[0] % n_head == 0
@@ -8,27 +22,27 @@ def get_alignment(attn_probs, mel_lens, n_head):
    for i in range(len(attn_probs)):
        multi_attn = attn_probs[i].numpy()
        for j in range(n_head):
-            attn = multi_attn[j*batch_size:(j+1)*batch_size]
+            attn = multi_attn[j * batch_size:(j + 1) * batch_size]
            F = score_F(attn)
            if max_F < F:
                max_F = F
                max_attn = attn
    alignment = compute_duration(max_attn, mel_lens)
    return alignment
 def score_F(attn):
    """Score an attention map by the mean of its maxima along the last axis.

    Args:
        attn: array-like of attention weights. The last axis is reduced with
            ``np.max``; the remaining values are averaged together.

    Returns:
        The mean of all last-axis maxima, as returned by ``np.mean``.
    """
    # Named ``row_peaks`` rather than ``max`` so the builtin is not shadowed.
    row_peaks = np.max(attn, axis=-1)
    return np.mean(row_peaks)
 def compute_duration(attn, mel_lens):
    # Counts, for every batch item i, how often each index along attn's third
    # axis is the argmax of a row attn[i, j], for j in range(mel_lens[i]).
    # Returns a NumPy array of shape (attn.shape[0], attn.shape[2]) whose
    # entry [i, t] is that count; rows j >= mel_lens[i] are ignored.
    # mel_lens must expose .numpy(); it is converted once before the loops.
-    alignment = np.zeros([attn.shape[0],attn.shape[2]])
+    alignment = np.zeros([attn.shape[0], attn.shape[2]])
    mel_lens = mel_lens.numpy()
    for i in range(attn.shape[0]):
        for j in range(mel_lens[i]):
-            max_index = np.argmax(attn[i,j])
+            max_index = np.argmax(attn[i, j])
-            alignment[i,max_index] += 1
+            alignment[i, max_index] += 1
    return alignment
--- a/parakeet/models/transformer_tts/__init__.py
+++ b/parakeet/models/transformer_tts/__init__.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
\ No newline at end of file
--- a/parakeet/models/transformer_tts/cbhg.py
+++ b/parakeet/models/transformer_tts/cbhg.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 from parakeet.g2p.text.symbols import symbols
 import paddle.fluid.dygraph as dg
@@ -7,9 +20,16 @@ from parakeet.modules.customized import Pool1D, Conv1D
 from parakeet.modules.dynamic_gru import DynamicGRU
 import numpy as np
 class CBHG(dg.Layer):
-    def __init__(self, hidden_size, batch_size, K=16, projection_size = 256, num_gru_layers=2, 
+    def __init__(self,
-                 max_pool_kernel_size=2, is_post=False):
+                 hidden_size,
+                 batch_size,
+                 K=16,
+                 projection_size=256,
+                 num_gru_layers=2,
+                 max_pool_kernel_size=2,
+                 is_post=False):
        super(CBHG, self).__init__()
        """
        :param hidden_size: dimension of hidden unit
@@ -24,28 +44,39 @@ class CBHG(dg.Layer):
        self.projection_size = projection_size
        self.conv_list = []
        k = math.sqrt(1 / projection_size)
-        self.conv_list.append(Conv1D(num_channels = projection_size,
+        self.conv_list.append(
-                            num_filters = hidden_size,
+            Conv1D(
-                            filter_size = 1,
+                num_channels=projection_size,
-                            padding = int(np.floor(1/2)),
+                num_filters=hidden_size,
-                            param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                filter_size=1,
-                            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
+                padding=int(np.floor(1 / 2)),
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.XavierInitializer()),
+                bias_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.Uniform(
+                        low=-k, high=k))))
        k = math.sqrt(1 / hidden_size)
-        for i in range(2,K+1):
+        for i in range(2, K + 1):
-            self.conv_list.append(Conv1D(num_channels = hidden_size,
+            self.conv_list.append(
-                            num_filters = hidden_size,
+                Conv1D(
-                            filter_size = i,
+                    num_channels=hidden_size,
-                            padding = int(np.floor(i/2)),
+                    num_filters=hidden_size,
-                            param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                    filter_size=i,
-                            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k))))
+                    padding=int(np.floor(i / 2)),
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.XavierInitializer()),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Uniform(
+                            low=-k, high=k))))
        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)
        self.batchnorm_list = []
        for i in range(K):
-            self.batchnorm_list.append(dg.BatchNorm(hidden_size, 
+            self.batchnorm_list.append(
-                            data_layout='NCHW'))
+                dg.BatchNorm(
+                    hidden_size, data_layout='NCHW'))
        for i, layer in enumerate(self.batchnorm_list):
            self.add_sublayer("batchnorm_list_{}".format(i), layer)
@@ -53,91 +84,120 @@ class CBHG(dg.Layer):
        conv_outdim = hidden_size * K
        k = math.sqrt(1 / conv_outdim)
-        self.conv_projection_1 = Conv1D(num_channels = conv_outdim,
+        self.conv_projection_1 = Conv1D(
-                            num_filters = hidden_size,
+            num_channels=conv_outdim,
-                            filter_size = 3,
+            num_filters=hidden_size,
-                            padding = int(np.floor(3/2)),
+            filter_size=3,
-                            param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+            padding=int(np.floor(3 / 2)),
-                            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
        k = math.sqrt(1 / hidden_size)
-        self.conv_projection_2 = Conv1D(num_channels = hidden_size,
+        self.conv_projection_2 = Conv1D(
-                            num_filters = projection_size,
+            num_channels=hidden_size,
-                            filter_size = 3,
+            num_filters=projection_size,
-                            padding = int(np.floor(3/2)),
+            filter_size=3,
-                            param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+            padding=int(np.floor(3 / 2)),
-                            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)))
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
-        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, 
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                            data_layout='NCHW')
+                low=-k, high=k)))
-        self.batchnorm_proj_2 = dg.BatchNorm(projection_size, 
-                            data_layout='NCHW')
+        self.batchnorm_proj_1 = dg.BatchNorm(hidden_size, data_layout='NCHW')
-        self.max_pool = Pool1D(pool_size = max_pool_kernel_size, 
+        self.batchnorm_proj_2 = dg.BatchNorm(
-                    pool_type='max', 
+            projection_size, data_layout='NCHW')
-                    pool_stride=1, 
+        self.max_pool = Pool1D(
-                    pool_padding=1,
+            pool_size=max_pool_kernel_size,
-                    data_format = "NCT")
+            pool_type='max',
+            pool_stride=1,
+            pool_padding=1,
+            data_format="NCT")
        self.highway = Highwaynet(self.projection_size)
        h_0 = np.zeros((batch_size, hidden_size // 2), dtype="float32")
        h_0 = dg.to_variable(h_0)
        k = math.sqrt(1 / hidden_size)
-        self.fc_forward1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
+        self.fc_forward1 = dg.Linear(
-                           param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            hidden_size,
-                           bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            hidden_size // 2 * 3,
-        self.fc_reverse1 = dg.Linear(hidden_size, hidden_size // 2 * 3,
+            param_attr=fluid.ParamAttr(
-                            param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+                initializer=fluid.initializer.XavierInitializer()),
-                            bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-        self.gru_forward1 = DynamicGRU(size = self.hidden_size // 2,
+                low=-k, high=k)))
-                              is_reverse = False,
+        self.fc_reverse1 = dg.Linear(
-                              origin_mode = True,
+            hidden_size,
-                              h_0 = h_0)
+            hidden_size // 2 * 3,
-        self.gru_reverse1 = DynamicGRU(size = self.hidden_size // 2,
+            param_attr=fluid.ParamAttr(
-                              is_reverse=True,
+                initializer=fluid.initializer.XavierInitializer()),
-                              origin_mode=True,
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
-                              h_0 = h_0)
+                low=-k, high=k)))
+        self.gru_forward1 = DynamicGRU(
-        self.fc_forward2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
+            size=self.hidden_size // 2,
-                           param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            is_reverse=False,
-                           bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            origin_mode=True,
-        self.fc_reverse2 = dg.Linear(hidden_size, hidden_size // 2 * 3,
+            h_0=h_0)
-                           param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+        self.gru_reverse1 = DynamicGRU(
-                           bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            size=self.hidden_size // 2,
-        self.gru_forward2 = DynamicGRU(size = self.hidden_size // 2,
+            is_reverse=True,
-                              is_reverse = False,
+            origin_mode=True,
-                              origin_mode = True,
+            h_0=h_0)
-                              h_0 = h_0)
-        self.gru_reverse2 = DynamicGRU(size = self.hidden_size // 2,
+        self.fc_forward2 = dg.Linear(
-                              is_reverse=True,
+            hidden_size,
-                              origin_mode=True,
+            hidden_size // 2 * 3,
-                              h_0 = h_0)
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
+        self.fc_reverse2 = dg.Linear(
+            hidden_size,
+            hidden_size // 2 * 3,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
+        self.gru_forward2 = DynamicGRU(
+            size=self.hidden_size // 2,
+            is_reverse=False,
+            origin_mode=True,
+            h_0=h_0)
+        self.gru_reverse2 = DynamicGRU(
+            size=self.hidden_size // 2,
+            is_reverse=True,
+            origin_mode=True,
+            h_0=h_0)
    def _conv_fit_dim(self, x, filter_size=3):
        # Returns x with its last element along the third axis dropped when
        # filter_size is even; for an odd filter_size x is returned unchanged.
        # NOTE(review): presumably this trims the extra step that an even
        # kernel produces with floor(filter_size / 2) padding -- confirm.
        if filter_size % 2 == 0:
-            return x[:,:,:-1]
+            return x[:, :, :-1]
        else:
-            return x 
+            return x
    def forward(self, input_):
        # input_.shape = [N, C, T]
        conv_list = []
        conv_input = input_
-        for i, (conv, batchnorm) in enumerate(zip(self.conv_list, self.batchnorm_list)):
+        for i, (conv, batchnorm
-            conv_input = self._conv_fit_dim(conv(conv_input), i+1)
+                ) in enumerate(zip(self.conv_list, self.batchnorm_list)):
+            conv_input = self._conv_fit_dim(conv(conv_input), i + 1)
            conv_input = layers.relu(batchnorm(conv_input))
            conv_list.append(conv_input)
        conv_cat = layers.concat(conv_list, axis=1)
-        conv_pool = self.max_pool(conv_cat)[:,:,:-1]
+        conv_pool = self.max_pool(conv_cat)[:, :, :-1]
+        conv_proj = layers.relu(
-        conv_proj = layers.relu(self.batchnorm_proj_1(self._conv_fit_dim(self.conv_projection_1(conv_pool))))
+            self.batchnorm_proj_1(
-        conv_proj = self.batchnorm_proj_2(self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
+                self._conv_fit_dim(self.conv_projection_1(conv_pool))))
+        conv_proj = self.batchnorm_proj_2(
+            self._conv_fit_dim(self.conv_projection_2(conv_proj))) + input_
        # conv_proj.shape = [N, C, T]
-        highway = layers.transpose(conv_proj, [0,2,1])
+        highway = layers.transpose(conv_proj, [0, 2, 1])
        highway = self.highway(highway)
        # highway.shape = [N, T, C]
@@ -151,9 +211,10 @@ class CBHG(dg.Layer):
        out_forward = self.gru_forward2(fc_forward)
        out_reverse = self.gru_reverse2(fc_reverse)
        out = layers.concat([out_forward, out_reverse], axis=-1)
-        out = layers.transpose(out, [0,2,1])
+        out = layers.transpose(out, [0, 2, 1])
        return out
 class Highwaynet(dg.Layer):
    def __init__(self, num_units, num_layers=4):
        super(Highwaynet, self).__init__()
@@ -164,14 +225,26 @@ class Highwaynet(dg.Layer):
        self.linears = []
        k = math.sqrt(1 / num_units)
        for i in range(num_layers):
-            self.linears.append(dg.Linear(num_units, num_units,
+            self.linears.append(
-                                param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+                dg.Linear(
-                                bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
+                    num_units,
-            self.gates.append(dg.Linear(num_units, num_units,
+                    num_units,
-                              param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+                    param_attr=fluid.ParamAttr(
-                              bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k))))
+                        initializer=fluid.initializer.XavierInitializer()),
+                    bias_attr=fluid.ParamAttr(
-        for i, (linear, gate) in enumerate(zip(self.linears,self.gates)):
+                        initializer=fluid.initializer.Uniform(
+                            low=-k, high=k))))
+            self.gates.append(
+                dg.Linear(
+                    num_units,
+                    num_units,
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.XavierInitializer()),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Uniform(
+                            low=-k, high=k))))
+        for i, (linear, gate) in enumerate(zip(self.linears, self.gates)):
            self.add_sublayer("linears_{}".format(i), linear)
            self.add_sublayer("gates_{}".format(i), gate)
@@ -183,12 +256,6 @@ class Highwaynet(dg.Layer):
            t_ = fluid.layers.sigmoid(gate(out))
            c = 1 - t_
-            out  = h * t_ + out  * c
+            out = h * t_ + out * c
-        return out
+        return out
--- a/parakeet/models/transformer_tts/decoder.py
+++ b/parakeet/models/transformer_tts/decoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
@@ -7,70 +20,110 @@ from parakeet.modules.ffn import PositionwiseFeedForward
 from parakeet.models.transformer_tts.prenet import PreNet
 from parakeet.models.transformer_tts.post_convnet import PostConvNet
 class Decoder(dg.Layer):
    def __init__(self, num_hidden, config, num_head=4):
        super(Decoder, self).__init__()
        self.num_hidden = num_hidden
        param = fluid.ParamAttr()
-        self.alpha = self.create_parameter(shape=(1,), attr=param, dtype='float32',
+        self.alpha = self.create_parameter(
-                        default_initializer = fluid.initializer.ConstantInitializer(value=1.0))
+            shape=(1, ),
-        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
+            attr=param,
-        self.pos_emb = dg.Embedding(size=[1024, num_hidden],
+            dtype='float32',
-                                 padding_idx=0,
+            default_initializer=fluid.initializer.ConstantInitializer(
-                                 param_attr=fluid.ParamAttr(
+                value=1.0))
-                                     initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
+        self.pos_inp = get_sinusoid_encoding_table(
-                                     trainable=False))
+            1024, self.num_hidden, padding_idx=0)
-        self.decoder_prenet = PreNet(input_size = config['audio']['num_mels'], 
+        self.pos_emb = dg.Embedding(
-                                            hidden_size = num_hidden * 2, 
+            size=[1024, num_hidden],
-                                            output_size = num_hidden, 
+            padding_idx=0,
-                                            dropout_rate=0.2)
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.NumpyArrayInitializer(
+                    self.pos_inp),
+                trainable=False))
+        self.decoder_prenet = PreNet(
+            input_size=config['audio']['num_mels'],
+            hidden_size=num_hidden * 2,
+            output_size=num_hidden,
+            dropout_rate=0.2)
        k = math.sqrt(1 / num_hidden)
-        self.linear = dg.Linear(num_hidden, num_hidden,
+        self.linear = dg.Linear(
-                                param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            num_hidden,
-                                bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            num_hidden,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
-        self.selfattn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
+        self.selfattn_layers = [
+            MultiheadAttention(num_hidden, num_hidden // num_head,
+                               num_hidden // num_head) for _ in range(3)
+        ]
        for i, layer in enumerate(self.selfattn_layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
-        self.attn_layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
+        self.attn_layers = [
+            MultiheadAttention(num_hidden, num_hidden // num_head,
+                               num_hidden // num_head) for _ in range(3)
+        ]
        for i, layer in enumerate(self.attn_layers):
            self.add_sublayer("attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1) for _ in range(3)]
+        self.ffns = [
+            PositionwiseFeedForward(
+                num_hidden, num_hidden * num_head, filter_size=1)
+            for _ in range(3)
+        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
-        self.mel_linear = dg.Linear(num_hidden, config['audio']['num_mels'] * config['audio']['outputs_per_step'],
+        self.mel_linear = dg.Linear(
-                                param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            num_hidden,
-                                bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            config['audio']['num_mels'] * config['audio']['outputs_per_step'],
-        self.stop_linear = dg.Linear(num_hidden, 1,
+            param_attr=fluid.ParamAttr(
-                                  param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+                initializer=fluid.initializer.XavierInitializer()),
-                                  bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
-        self.postconvnet = PostConvNet(config['audio']['num_mels'], config['hidden_size'], 
+        self.stop_linear = dg.Linear(
-                                       filter_size = 5, padding = 4, num_conv=5, 
+            num_hidden,
-                                       outputs_per_step=config['audio']['outputs_per_step'], 
+            1,
-                                       use_cudnn = True)
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
+        self.postconvnet = PostConvNet(
+            config['audio']['num_mels'],
+            config['hidden_size'],
+            filter_size=5,
+            padding=4,
+            num_conv=5,
+            outputs_per_step=config['audio']['outputs_per_step'],
+            use_cudnn=True)
    def forward(self, key, value, query, c_mask, positional):
        # get decoder mask with triangular matrix
        if fluid.framework._dygraph_tracer()._train_mode:
            m_mask = get_non_pad_mask(positional)
-            mask = get_attn_key_pad_mask((positional==0).astype(np.float32), query)
+            mask = get_attn_key_pad_mask((positional == 0).astype(np.float32),
-            triu_tensor = dg.to_variable(get_triu_tensor(query.numpy(), query.numpy())).astype(np.float32)
+                                         query)
+            triu_tensor = dg.to_variable(
+                get_triu_tensor(query.numpy(), query.numpy())).astype(
+                    np.float32)
            mask = mask + triu_tensor
            mask = fluid.layers.cast(mask == 0, np.float32)
            # (batch_size, decoder_len, encoder_len)
-            zero_mask = get_attn_key_pad_mask(layers.squeeze(c_mask,[-1]), query)
+            zero_mask = get_attn_key_pad_mask(
+                layers.squeeze(c_mask, [-1]), query)
        else:
-            mask = get_triu_tensor(query.numpy(), query.numpy()).astype(np.float32)
+            mask = get_triu_tensor(query.numpy(),
+                                   query.numpy()).astype(np.float32)
            mask = fluid.layers.cast(dg.to_variable(mask == 0), np.float32)
            m_mask, zero_mask = None, None
        # Decoder pre-network
        query = self.decoder_prenet(query)
        # Centered position
        query = self.linear(query)
@@ -84,10 +137,13 @@ class Decoder(dg.Layer):
        # Attention decoder-decoder, encoder-decoder
        selfattn_list = list()
        attn_list = list()
-        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers, self.ffns):
+        for selfattn, attn, ffn in zip(self.selfattn_layers, self.attn_layers,
-            query, attn_dec = selfattn(query, query, query, mask = mask, query_mask = m_mask)
+                                       self.ffns):
-            query, attn_dot = attn(key, value, query, mask = zero_mask, query_mask = m_mask)
+            query, attn_dec = selfattn(
+                query, query, query, mask=mask, query_mask=m_mask)
+            query, attn_dot = attn(
+                key, value, query, mask=zero_mask, query_mask=m_mask)
            query = ffn(query)
            selfattn_list.append(attn_dec)
            attn_list.append(attn_dot)
@@ -96,7 +152,7 @@ class Decoder(dg.Layer):
        # Post Mel Network
        out = self.postconvnet(mel_out)
        out = mel_out + out
        # Stop tokens
        stop_tokens = self.stop_linear(query)
        stop_tokens = layers.squeeze(stop_tokens, [-1])

--- a/parakeet/models/transformer_tts/encoder.py
+++ b/parakeet/models/transformer_tts/encoder.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 from parakeet.models.transformer_tts.utils import *
@@ -5,25 +18,41 @@ from parakeet.modules.multihead_attention import MultiheadAttention
 from parakeet.modules.ffn import PositionwiseFeedForward
 from parakeet.models.transformer_tts.encoderprenet import EncoderPrenet
 class Encoder(dg.Layer):
    def __init__(self, embedding_size, num_hidden, num_head=4):
        super(Encoder, self).__init__()
        self.num_hidden = num_hidden
-        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(value=1.0))
+        param = fluid.ParamAttr(initializer=fluid.initializer.Constant(
-        self.alpha = self.create_parameter(shape=(1, ), attr=param, dtype='float32')
+            value=1.0))
-        self.pos_inp = get_sinusoid_encoding_table(1024, self.num_hidden, padding_idx=0)
+        self.alpha = self.create_parameter(
-        self.pos_emb = dg.Embedding(size=[1024, num_hidden],
+            shape=(1, ), attr=param, dtype='float32')
-                                 padding_idx=0,
+        self.pos_inp = get_sinusoid_encoding_table(
-                                 param_attr=fluid.ParamAttr(
+            1024, self.num_hidden, padding_idx=0)
-                                     initializer=fluid.initializer.NumpyArrayInitializer(self.pos_inp),
+        self.pos_emb = dg.Embedding(
-                                     trainable=False))
+            size=[1024, num_hidden],
-        self.encoder_prenet = EncoderPrenet(embedding_size = embedding_size, 
+            padding_idx=0,
-                                            num_hidden = num_hidden, 
+            param_attr=fluid.ParamAttr(
-                                            use_cudnn=True)
+                initializer=fluid.initializer.NumpyArrayInitializer(
-        self.layers = [MultiheadAttention(num_hidden, num_hidden//num_head, num_hidden//num_head) for _ in range(3)]
+                    self.pos_inp),
+                trainable=False))
+        self.encoder_prenet = EncoderPrenet(
+            embedding_size=embedding_size,
+            num_hidden=num_hidden,
+            use_cudnn=True)
+        self.layers = [
+            MultiheadAttention(num_hidden, num_hidden // num_head,
+                               num_hidden // num_head) for _ in range(3)
+        ]
        for i, layer in enumerate(self.layers):
            self.add_sublayer("self_attn_{}".format(i), layer)
-        self.ffns = [PositionwiseFeedForward(num_hidden, num_hidden*num_head, filter_size=1, use_cudnn = True) for _ in range(3)]
+        self.ffns = [
+            PositionwiseFeedForward(
+                num_hidden,
+                num_hidden * num_head,
+                filter_size=1,
+                use_cudnn=True) for _ in range(3)
+        ]
        for i, layer in enumerate(self.ffns):
            self.add_sublayer("ffns_{}".format(i), layer)
@@ -33,25 +62,23 @@ class Encoder(dg.Layer):
            mask = get_attn_key_pad_mask(positional, x)
        else:
            query_mask, mask = None, None
        # Encoder pre_network
-        x = self.encoder_prenet(x) #(N,T,C)
+        x = self.encoder_prenet(x)  #(N,T,C)
        # Get positional encoding
-        positional = self.pos_emb(positional) 
+        positional = self.pos_emb(positional)
-        x = positional * self.alpha + x #(N, T, C)
+        x = positional * self.alpha + x  #(N, T, C)
        # Positional dropout
        x = layers.dropout(x, 0.1)
        # Self attention encoder
        attentions = list()
        for layer, ffn in zip(self.layers, self.ffns):
-            x, attention = layer(x, x, x, mask = mask, query_mask = query_mask)
+            x, attention = layer(x, x, x, mask=mask, query_mask=query_mask)
            x = ffn(x)
            attentions.append(attention)
        return x, query_mask, attentions
\ No newline at end of file
--- a/parakeet/models/transformer_tts/encoderprenet.py
+++ b/parakeet/models/transformer_tts/encoderprenet.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 from parakeet.g2p.text.symbols import symbols
 import paddle.fluid.dygraph as dg
@@ -13,47 +26,63 @@ class EncoderPrenet(dg.Layer):
        self.embedding_size = embedding_size
        self.num_hidden = num_hidden
        self.use_cudnn = use_cudnn
-        self.embedding = dg.Embedding( size = [len(symbols), embedding_size],
+        self.embedding = dg.Embedding(
-                                        padding_idx = None)
+            size=[len(symbols), embedding_size], padding_idx=None)
        self.conv_list = []
        k = math.sqrt(1 / embedding_size)
-        self.conv_list.append(Conv1D(num_channels = embedding_size, 
+        self.conv_list.append(
-                            num_filters = num_hidden, 
+            Conv1D(
-                            filter_size = 5,
+                num_channels=embedding_size,
-                            padding = int(np.floor(5/2)),
+                num_filters=num_hidden,
-                            param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                filter_size=5,
-                            bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                padding=int(np.floor(5 / 2)),
-                            use_cudnn = use_cudnn))
+                param_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.XavierInitializer()),
+                bias_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.Uniform(
+                        low=-k, high=k)),
+                use_cudnn=use_cudnn))
        k = math.sqrt(1 / num_hidden)
        for _ in range(2):
-            self.conv_list.append(Conv1D(num_channels = num_hidden, 
+            self.conv_list.append(
-                                num_filters = num_hidden, 
+                Conv1D(
-                                filter_size = 5,
+                    num_channels=num_hidden,
-                                padding = int(np.floor(5/2)),
+                    num_filters=num_hidden,
-                                param_attr = fluid.ParamAttr(initializer=fluid.initializer.XavierInitializer()),
+                    filter_size=5,
-                                bias_attr = fluid.ParamAttr(initializer=fluid.initializer.Uniform(low=-k, high=k)),
+                    padding=int(np.floor(5 / 2)),
-                                use_cudnn = use_cudnn))
+                    param_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.XavierInitializer()),
+                    bias_attr=fluid.ParamAttr(
+                        initializer=fluid.initializer.Uniform(
+                            low=-k, high=k)),
+                    use_cudnn=use_cudnn))
        for i, layer in enumerate(self.conv_list):
            self.add_sublayer("conv_list_{}".format(i), layer)
-        self.batch_norm_list = [dg.BatchNorm(num_hidden, 
+        self.batch_norm_list = [
-                            data_layout='NCHW') for _ in range(3)]
+            dg.BatchNorm(
+                num_hidden, data_layout='NCHW') for _ in range(3)
+        ]
        for i, layer in enumerate(self.batch_norm_list):
            self.add_sublayer("batch_norm_list_{}".format(i), layer)
        k = math.sqrt(1 / num_hidden)
-        self.projection = dg.Linear(num_hidden, num_hidden,
+        self.projection = dg.Linear(
-                                param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            num_hidden,
-                                bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            num_hidden,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
    def forward(self, x):
-        x = self.embedding(x) #(batch_size, seq_len, embending_size)
+        x = self.embedding(x)  #(batch_size, seq_len, embedding_size)
-        x = layers.transpose(x,[0,2,1])
+        x = layers.transpose(x, [0, 2, 1])
        for batch_norm, conv in zip(self.batch_norm_list, self.conv_list):
            x = layers.dropout(layers.relu(batch_norm(conv(x))), 0.2)
-        x = layers.transpose(x,[0,2,1]) #(N,T,C)
+        x = layers.transpose(x, [0, 2, 1])  #(N,T,C)
        x = self.projection(x)
        return x
\ No newline at end of file
--- a/parakeet/models/transformer_tts/post_convnet.py
+++ b/parakeet/models/transformer_tts/post_convnet.py
--- a/parakeet/models/transformer_tts/prenet.py
+++ b/parakeet/models/transformer_tts/prenet.py
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import math
 import paddle.fluid.dygraph as dg
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
 class PreNet(dg.Layer):
    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.2):
        """
@@ -17,13 +31,21 @@ class PreNet(dg.Layer):
        self.dropout_rate = dropout_rate
        k = math.sqrt(1 / input_size)
-        self.linear1 = dg.Linear(input_size, hidden_size,
+        self.linear1 = dg.Linear(
-                              param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            input_size,
-                              bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            hidden_size,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
        k = math.sqrt(1 / hidden_size)
-        self.linear2 = dg.Linear(hidden_size, output_size,
+        self.linear2 = dg.Linear(
-                              param_attr=fluid.ParamAttr(initializer = fluid.initializer.XavierInitializer()),
+            hidden_size,
-                              bias_attr=fluid.ParamAttr(initializer = fluid.initializer.Uniform(low=-k, high=k)))
+            output_size,
+            param_attr=fluid.ParamAttr(
+                initializer=fluid.initializer.XavierInitializer()),
+            bias_attr=fluid.ParamAttr(initializer=fluid.initializer.Uniform(
+                low=-k, high=k)))
    def forward(self, x):
        """

--- a/parakeet/models/transformer_tts/transformer_tts.py
+++ b/parakeet/models/transformer_tts/transformer_tts.py
--- a/parakeet/models/transformer_tts/utils.py
+++ b/parakeet/models/transformer_tts/utils.py
--- a/parakeet/models/transformer_tts/vocoder.py
+++ b/parakeet/models/transformer_tts/vocoder.py
--- a/parakeet/models/waveflow/__init__.py
+++ b/parakeet/models/waveflow/__init__.py
--- a/parakeet/models/waveflow/data.py
+++ b/parakeet/models/waveflow/data.py
--- a/parakeet/models/waveflow/waveflow.py
+++ b/parakeet/models/waveflow/waveflow.py
--- a/parakeet/models/waveflow/waveflow_modules.py
+++ b/parakeet/models/waveflow/waveflow_modules.py
--- a/parakeet/models/wavenet/README.md
+++ b/parakeet/models/wavenet/README.md
--- a/parakeet/models/wavenet/data.py
+++ b/parakeet/models/wavenet/data.py
--- a/parakeet/models/wavenet/slurm.py
+++ b/parakeet/models/wavenet/slurm.py
--- a/parakeet/models/wavenet/synthesis.py
+++ b/parakeet/models/wavenet/synthesis.py
--- a/parakeet/models/wavenet/train.py
+++ b/parakeet/models/wavenet/train.py
--- a/parakeet/models/wavenet/utils.py
+++ b/parakeet/models/wavenet/utils.py
--- a/parakeet/models/wavenet/wavenet.py
+++ b/parakeet/models/wavenet/wavenet.py
--- a/parakeet/models/wavenet/wavenet_modules.py
+++ b/parakeet/models/wavenet/wavenet_modules.py
--- a/parakeet/modules/__init__.py
+++ b/parakeet/modules/__init__.py
--- a/parakeet/modules/customized.py
+++ b/parakeet/modules/customized.py
--- a/parakeet/modules/dynamic_gru.py
+++ b/parakeet/modules/dynamic_gru.py
--- a/parakeet/modules/ffn.py
+++ b/parakeet/modules/ffn.py
--- a/parakeet/modules/multihead_attention.py
+++ b/parakeet/modules/multihead_attention.py
--- a/parakeet/modules/weight_norm.py
+++ b/parakeet/modules/weight_norm.py
--- a/parakeet/utils/layer_tools.py
+++ b/parakeet/utils/layer_tools.py
--- a/setup.py
+++ b/setup.py
--- a/tests/test_ljspeech.py
+++ b/tests/test_ljspeech.py
--- a/tests/test_vctk.py
+++ b/tests/test_vctk.py
--- a/tools/copyright.hook
+++ b/tools/copyright.hook