Paddle dynamic-graph implementation of Deep Voice 3, a convolutional-network-based text-to-speech synthesis model. The implementation is based on [Deep Voice 3: Scaling Text-to-Speech with Convolutional Sequence Learning](https://arxiv.org/abs/1710.07654).
...
...
The model consists of an encoder, a decoder and a converter (and a speaker embedding for multi-speaker models).
## Project Structure
```text
├── data.py data processing
├── ljspeech.yaml (example) configuration file
├── sentences.txt sample sentences
├── synthesis.py script to synthesize waveform from text
...
...
optional arguments:
  ...
  -o OUTPUT, --output OUTPUT
                        The directory to save result.
  -g DEVICE, --device DEVICE
                        device to use
```
1. `--config` is the configuration file to use. The provided `ljspeech.yaml` can be used directly, or you can change some values in it and train the model with a different configuration.
2. `--data` is the path of the LJSpeech dataset: the folder extracted from the downloaded archive (the one that contains `metadata.csv`).
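For example, a training run using the options from the help text above might look like this (the `train.py` entry point and all paths here are assumptions for illustration; adapt them to your setup):

```bash
python train.py \
    --config=ljspeech.yaml \
    --data=./LJSpeech-1.1 \
    --output=./experiment \
    --device=0
```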
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
In most cases, we have a non-stream dataset, which means we can randomly access it with __getitem__ and get its length with __len__.
...
...
This suffices for a sampler. We implement a sampler as an iterable of valid indices.
So the sampler is only responsible for generating valid indices.
"""
import numpy as np
import random


class Sampler(object):
    def __init__(self, data_source):
        pass
...
...
class SequentialSampler(Sampler):
    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        return iter(range(len(self.data_source)))
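# Illustrative usage (not part of the original file): any object with __len__
# can serve as a data source; iterating the sampler yields indices, not data.
#
# >>> list(SequentialSampler([10, 20, 30]))
# [0, 1, 2]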
...
...
class RandomSampler(Sampler):
    ...
                         "replacement={}".format(self.replacement))

        if self._num_samples is not None and not replacement:
            raise ValueError(
                "With replacement=False, num_samples should not be specified, "
                "since a random permutation will be performed.")
One of the reasons we choose to load data lazily (only load metadata beforehand, and load the data itself when it is actually needed) ...
For deep learning practice, we typically batch examples, so the dataset should come with a method to batch them. Assume a record is implemented as a tuple of several items. When an item is a fixed-size array, batching is trivial: `np.stack` suffices. But for arrays with dynamic sizes, padding is needed (see the sketch below). We decided to implement a batching method for each kind of item, so that batching a record can be composed from these methods. Each dataset should implement a `_batch_examples` method, but in most cases you can choose one from `batching.py`.
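As an illustration, a batching method for dynamically sized 1-D arrays could pad every example to the longest length and then stack them. This is a minimal sketch, assuming float arrays and a hypothetical `pad_value` parameter; it is not the exact code in `batching.py`:

```python
import numpy as np

def batch_ndarray(examples, pad_value=0.0):
    """Pad 1-D arrays to the length of the longest one, then stack them."""
    max_len = max(len(x) for x in examples)
    padded = [
        np.pad(x, (0, max_len - len(x)), mode="constant",
               constant_values=pad_value) for x in examples
    ]
    return np.stack(padded)
```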
Paddle fluid implementation of WaveNet, a deep generative model of raw audio waveforms.
WaveNet model is originally proposed in [WaveNet: A Generative Model for Raw Audio](https://arxiv.org/abs/1609.03499).
Our implementation is based on the WaveNet architecture described in [ClariNet: Parallel Wave Generation in End-to-End Text-to-Speech](https://arxiv.org/abs/1807.07281) and can provide various output distributions, including single Gaussian, mixture of Gaussian, and softmax with linearly quantized channels.
We implement the WaveNet model in Paddle Fluid with dynamic graph, which is convenient for building flexible network architectures.
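For instance, the softmax output treats each waveform sample as a categorical variable over linearly quantized channels. A minimal sketch of such quantization (the channel count here is an assumption for illustration, not the model's configured value):

```python
import numpy as np

def quantize_linear(samples, n_channels=2048):
    """Map float samples in [-1, 1) to integer bins in [0, n_channels - 1]."""
    samples = np.clip(samples, -1.0, 1.0 - 1e-9)
    return ((samples + 1.0) / 2.0 * n_channels).astype("int64")
```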
Our model will save model parameters as checkpoints in `./runs/wavenet/${ModelName}/checkpoint/` every 10000 iterations by default.
The saved checkpoint will have the format of `step-${iteration_number}.pdparams` for model parameters and `step-${iteration_number}.pdopt` for optimizer parameters.
There are three ways to load a checkpoint and resume training (for example, suppose you want to load a checkpoint from iteration 500000):
1. Use `--checkpoint=./runs/wavenet/${ModelName}/checkpoint/step-500000` to provide a specific path to load. Note that you only need to provide the base name of the parameter file, `step-500000`; the extensions `.pdparams` and `.pdopt` are not needed.
2. Use `--iteration=500000`.
3. If you specify neither `--checkpoint` nor `--iteration`, the model will automatically load the latest checkpoint in `./runs/wavenet/${ModelName}/checkpoint` (see the sketch below).
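The sketch below shows how such auto-discovery could work; it is a hypothetical helper for illustration, not necessarily the repository's exact logic:

```python
import os
import re

def latest_checkpoint(checkpoint_dir):
    """Return the base path (no extension) of the newest step-N checkpoint."""
    best_iteration, best_path = -1, None
    for name in os.listdir(checkpoint_dir):
        match = re.match(r"step-(\d+)\.pdparams$", name)
        if match and int(match.group(1)) > best_iteration:
            best_iteration = int(match.group(1))
            # base name only, as --checkpoint expects (no .pdparams / .pdopt)
            best_path = os.path.join(checkpoint_dir,
                                     "step-{}".format(best_iteration))
    return best_path
```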
In this example, `--output` specifies where to save the synthesized audio files, and `--sample` specifies which sample in the validation dataset (a split from the whole LJSpeech dataset, by default containing the first 16 audio samples) to synthesize, based on the mel-spectrogram computed from that sample's ground-truth audio; e.g., `--sample=0` means synthesizing the first audio in the validation dataset.
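For example, assuming the synthesis entry point is `synthesis.py` and that it accepts the checkpoint options described above (paths are placeholders):

```bash
python synthesis.py \
    --output=./syn_audios \
    --sample=0 \
    --iteration=500000
```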