diff --git a/deepvoice3/compute_timestamp_ratio.py b/deepvoice3/compute_timestamp_ratio.py
index 7fc306ff98075d0b0245a45f7b0a5a093ac80485..d737059c1f447c905923749f19f5711d17041d29 100644
--- a/deepvoice3/compute_timestamp_ratio.py
+++ b/deepvoice3/compute_timestamp_ratio.py
@@ -14,7 +14,7 @@ from hparams import hparams, hparams_debug_string
 from data.data import TextDataSource, MelSpecDataSource
 from nnmnkwii.datasets import FileSourceDataset
 from tqdm import trange
-from modules import frontend
+import g2p as frontend
 
 
 def build_parser():
diff --git a/deepvoice3/data.py b/deepvoice3/data.py
index 197b1f37fe00170723e4596383f872b328bf9982..6c9c85aa7fbdc8386fb56866e55d787cbe5a1d6c 100644
--- a/deepvoice3/data.py
+++ b/deepvoice3/data.py
@@ -25,7 +25,7 @@ import random
 
 # import global hyper parameters
 from hparams import hparams
-from modules import frontend
+import g2p as frontend
 import builder
 
 _frontend = getattr(frontend, hparams.frontend)
diff --git a/deepvoice3/dry_run.py b/deepvoice3/dry_run.py
index 4c1366a40da9eafc8687b613212a5de0c3e319e8..4428e40ef34e655d8059f73f9a47edb0a89d5574 100644
--- a/deepvoice3/dry_run.py
+++ b/deepvoice3/dry_run.py
@@ -17,7 +17,7 @@ from paddle import fluid
 import paddle.fluid.dygraph as dg
 
 from hparams import hparams, hparams_debug_string
-from modules import frontend
+import g2p as frontend
 from deepvoice3 import DeepVoiceTTS
 
 
diff --git a/deepvoice3/eval_model.py b/deepvoice3/eval_model.py
index 870fdd6196f98534e352f2bb53cceeef60092381..101d352e5fcd4c02a93502181ba75e2d14fe0d89 100644
--- a/deepvoice3/eval_model.py
+++ b/deepvoice3/eval_model.py
@@ -37,7 +37,7 @@ from tensorboardX import SummaryWriter
 
 # import global hyper parameters
 from hparams import hparams
-from modules import frontend
+import g2p as frontend
 
 _frontend = getattr(frontend, hparams.frontend)
 
diff --git a/deepvoice3/synthesis.py b/deepvoice3/synthesis.py
index e043403f944963fef64202ccb42d15cfb1e8eaa5..e589c570a44b45d5aa7782ec66a9c53c432d8c4a 100644
--- a/deepvoice3/synthesis.py
+++ b/deepvoice3/synthesis.py
@@ -30,7 +30,7 @@ import paddle.fluid.dygraph as dg
 
 sys.path.append("../")
 import audio
-from modules import frontend
+import g2p as frontend
 import dry_run
 
 from hparams import hparams
diff --git a/deepvoice3/train.py b/deepvoice3/train.py
index f36ade4676e1c18ba5abe41fc37b9435c1bbd8ea..0f6859ca965b2699315ac6b4d2c69fc7c4b78d73 100644
--- a/deepvoice3/train.py
+++ b/deepvoice3/train.py
@@ -32,7 +32,7 @@ from data import (TextDataSource, MelSpecDataSource,
                                     LinearSpecDataSource,
                                     PartialyRandomizedSimilarTimeLengthSampler,
                                     Dataset, make_loader, create_batch)
-from modules import frontend
+import g2p as frontend
 from builder import deepvoice3, WindowRange
 from dry_run import dry_run
 from train_model import train_model
diff --git a/modules/__init__.py b/modules/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/modules/conv.py b/modules/conv.py
deleted file mode 100644
index 34149be142490b52e60544e075741e5392fdd261..0000000000000000000000000000000000000000
--- a/modules/conv.py
+++ /dev/null
@@ -1,222 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import numpy as np
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-from weight_norm import Conv2D, Conv2DTranspose
-
-
-class Conv1D(dg.Layer):
-    """
-    A convolution 1D block implemented with Conv2D. For simplicity and to
-    ensure the output has the same length as the input, it does not allow
-    stride > 1.
-
-    Besides the usual ``forward`` over a whole sequence, the layer supports
-    step-by-step computation through ``start_new_sequence`` and
-    ``add_input``.
-    """
-
-    def __init__(self,
-                 name_scope,
-                 in_cahnnels,
-                 num_filters,
-                 filter_size=3,
-                 dilation=1,
-                 groups=None,
-                 causal=False,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 dtype="float32"):
-        # NOTE: the parameter name ``in_cahnnels`` is misspelled, but it is
-        # part of the public signature and is kept as is.
-        super(Conv1D, self).__init__(name_scope, dtype=dtype)
-
-        # Causal mode pads the full receptive field (the surplus is sliced
-        # off in ``forward``); otherwise half of it is padded on each side.
-        if causal:
-            padding = dilation * (filter_size - 1)
-        else:
-            padding = (dilation * (filter_size - 1)) // 2
-
-        self.in_channels = in_cahnnels
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.causal = causal
-        self.padding = padding
-        self.act = act
-
-        # The 1D convolution is expressed as a 2D convolution whose kernel,
-        # stride, dilation and padding are trivial along the first spatial
-        # axis.
-        self.conv = Conv2D(
-            self.full_name(),
-            num_filters=num_filters,
-            filter_size=(1, filter_size),
-            stride=(1, 1),
-            dilation=(1, dilation),
-            padding=(0, padding),
-            groups=groups,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            use_cudnn=use_cudnn,
-            act=act,
-            dtype=dtype)
-
-    def forward(self, x):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
-                input channels.
-
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the outputs, where C_out means
-                output channels (num_filters).
-        """
-        x = self.conv(x)
-        if self.filter_size > 1:
-            if self.causal:
-                # Drop the last ``padding`` steps along the time axis.
-                x = fluid.layers.slice(
-                    x, axes=[3], starts=[0], ends=[-self.padding])
-            elif self.filter_size % 2 == 0:
-                # For an even filter size, drop the last step along the
-                # time axis.
-                x = fluid.layers.slice(x, axes=[3], starts=[0], ends=[-1])
-        return x
-
-    def start_new_sequence(self):
-        """
-        Reset the cached weight matrix and the input buffer used by
-        ``add_input``. ``__init__`` does not create these attributes, so
-        this is presumably meant to be called before the first
-        ``add_input`` call.
-        """
-        self.temp_weight = None
-        self.input_buffer = None
-
-    def add_input(self, x):
-        """
-        Adding input for a time step and compute an output for a time step.
-
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input, where C_in means
-                input channels, and T = 1.
-
-        Returns:
-            out (Variable): Shape(B, C_out, 1, T), the outputs, where C_out
-            means output channels (num_filters), and T = 1.
-
-        """
-        # Compute the linearized weight once and reuse it until the next
-        # ``start_new_sequence`` call.
-        if self.temp_weight is None:
-            self.temp_weight = self._reshaped_weight()
-
-        # Number of time steps spanned by the dilated filter.
-        window_size = 1 + (self.filter_size - 1) * self.dilation
-        batch_size = x.shape[0]
-        in_channels = x.shape[1]
-
-        if self.filter_size > 1:
-            if self.input_buffer is None:
-                # First step: start from ``window_size - 1`` zero steps.
-                self.input_buffer = fluid.layers.fill_constant(
-                    [batch_size, in_channels, 1, window_size - 1],
-                    dtype=x.dtype,
-                    value=0.0)
-            else:
-                # Later steps: discard the oldest buffered step.
-                self.input_buffer = self.input_buffer[:, :, :, 1:]
-            self.input_buffer = fluid.layers.concat(
-                [self.input_buffer, x], axis=3)
-            x = self.input_buffer
-            if self.dilation > 1:
-                # Keep only every ``dilation``-th buffered step. The index
-                # tensor is created once and cached on the layer.
-                if not hasattr(self, "indices"):
-                    self.indices = dg.to_variable(
-                        np.arange(0, window_size, self.dilation))
-                # Move the time axis to the front so that ``gather`` selects
-                # time steps, then restore the original layout.
-                tmp = fluid.layers.transpose(
-                    self.input_buffer, perm=[3, 1, 2, 0])
-                tmp = fluid.layers.gather(tmp, index=self.indices)
-                tmp = fluid.layers.transpose(tmp, perm=[3, 1, 2, 0])
-                x = tmp
-        # Flatten the window and apply the convolution as a matmul with the
-        # linearized weight, then add the bias and apply the activation.
-        inputs = fluid.layers.reshape(
-            x, shape=[batch_size, in_channels * 1 * self.filter_size])
-        out = fluid.layers.matmul(inputs, self.temp_weight, transpose_y=True)
-        out = fluid.layers.elementwise_add(out, self.conv._bias_param, axis=-1)
-        out = fluid.layers.reshape(out, out.shape + [1, 1])
-        out = self._helper.append_activation(out, act=self.act)
-        return out
-
-    def _reshaped_weight(self):
-        """
-        Get the linearized weight of convolution filter, cause it is by nature 
-        a matmul weight. And because the model uses weight norm, compute the
-        weight by weight_v * weight_g to make it faster.
-
-        Returns:
-            weight_matrix (Variable): Shape(C_out, C_in * 1 * kernel_size)
-        """
-        shape = self.conv._filter_param_v.shape
-        # Collapse every dimension after the first into one.
-        matrix_shape = [shape[0], np.prod(shape[1:])]
-        weight_matrix = fluid.layers.reshape(
-            self.conv._filter_param_v, shape=matrix_shape)
-        # L2-normalize each row of v, then scale it by g.
-        weight_matrix = fluid.layers.elementwise_mul(
-            fluid.layers.l2_normalize(
-                weight_matrix, axis=1),
-            self.conv._filter_param_g,
-            axis=0)
-        return weight_matrix
-
-
-class Conv1DTranspose(dg.Layer):
-    """
-    A convolutional transpose 1D block implemented with convolutional transpose
-    2D. It does not ensure that the output is exactly expanded stride times in 
-    time dimension.
-    """
-
-    def __init__(self,
-                 name_scope,
-                 in_channels,
-                 num_filters,
-                 filter_size,
-                 padding=0,
-                 stride=1,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 dtype="float32"):
-        super(Conv1DTranspose, self).__init__(name_scope, dtype=dtype)
-
-        self.in_channels = in_channels
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.padding = padding
-        self.stride = stride
-        self.dilation = dilation
-        self.groups = groups
-
-        # The 1D transposed convolution is expressed as a 2D one whose
-        # kernel, padding, stride and dilation are trivial along the first
-        # spatial axis.
-        self.conv_transpose = Conv2DTranspose(
-            self.full_name(),
-            num_filters,
-            filter_size=(1, filter_size),
-            padding=(0, padding),
-            stride=(1, stride),
-            dilation=(1, dilation),
-            groups=groups,
-            param_attr=param_attr,
-            bias_attr=bias_attr,
-            use_cudnn=use_cudnn,
-            act=act,
-            dtype=dtype)
-
-    def forward(self, x):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T_in), where C_in means the input
-                channels and T_in means the number of time steps of input.
-
-        Returns:
-            out (Variable): shape(B, C_out, 1, T_out), where C_out means the
-                output channels and T_out means the number of time steps of
-                output.
-        """
-        return self.conv_transpose(x)
diff --git a/modules/frontend/README.md b/modules/frontend/README.md
deleted file mode 100644
index af4513e747f223e42ebc10f05761f6f3146d8a27..0000000000000000000000000000000000000000
--- a/modules/frontend/README.md
+++ /dev/null
@@ -1 +0,0 @@
-This package is adapted from https://github.com/r9y9/deepvoice3_pytorch/tree/master/deepvoice3_pytorch/frontend, Copyright (c) 2017: Ryuichi Yamamoto, whose license applies.
diff --git a/modules/frontend/__init__.py b/modules/frontend/__init__.py
deleted file mode 100644
index f5f6c0ae9e19e140f70d454cfa4ca6ebdb324f22..0000000000000000000000000000000000000000
--- a/modules/frontend/__init__.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# coding: utf-8
-"""Text processing frontend
-
-All frontend module should have the following functions:
-
-- text_to_sequence(text, p)
-- sequence_to_text(sequence)
-
-and the property:
-
-- n_vocab
-
-"""
-from . import en
-
-# Optional Japanese frontend; left as None when its import fails.
-try:
-    from . import jp
-except ImportError:
-    jp = None
-
-# Optional Korean frontend; left as None when its import fails.
-try:
-    from . import ko
-except ImportError:
-    ko = None
-
-# Optional Spanish frontend; left as None when its import fails.
-# If you are going to use this frontend, you need to modify _characters in
-# symbols.py:
-# _characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? ' + '¡¿ñáéíóúÁÉÍÓÚÑ'
-try:
-    from . import es
-except ImportError:
-    es = None
diff --git a/modules/frontend/en/__init__.py b/modules/frontend/en/__init__.py
deleted file mode 100644
index 58cf2f6415f010936c3b7eb2eceb027fe4e640fb..0000000000000000000000000000000000000000
--- a/modules/frontend/en/__init__.py
+++ /dev/null
@@ -1,35 +0,0 @@
-# coding: utf-8
-from modules.frontend.text.symbols import symbols
-
-import nltk
-from random import random
-
-# Vocabulary size: one entry per symbol in the shared symbol table.
-n_vocab = len(symbols)
-
-# Word -> list of pronunciations, loaded from NLTK's cmudict corpus at
-# import time.
-_arpabet = nltk.corpus.cmudict.dict()
-
-
-def _maybe_get_arpabet(word, p):
-    """Return ``word`` as ``{PHONEMES}`` with probability ``p``.
-
-    Words missing from ``_arpabet`` are returned unchanged. For known words
-    the first listed pronunciation is used.
-    """
-    if word not in _arpabet:
-        return word
-
-    pronunciation = " ".join(_arpabet[word][0])
-    if random() < p:
-        return '{%s}' % pronunciation
-    return word
-
-
-def mix_pronunciation(text, p):
-    """Randomly replace space-separated words of ``text`` with phonemes.
-
-    Each word is passed through ``_maybe_get_arpabet`` with probability
-    ``p`` and the results are re-joined with single spaces.
-    """
-    words = text.split(' ')
-    return ' '.join([_maybe_get_arpabet(word, p) for word in words])
-
-
-def text_to_sequence(text, p=0.0):
-    """Convert English ``text`` to a sequence of symbol IDs.
-
-    Args:
-        text: the input string.
-        p: probability of replacing a word by its phonemes; mixing is
-            skipped when ``p`` is negative.
-
-    Returns:
-        The ID sequence produced by the shared text module with the
-        ``english_cleaners`` pipeline.
-    """
-    mixed = mix_pronunciation(text, p) if p >= 0 else text
-    # Imported locally under an alias: this function shares its name with
-    # the one it delegates to.
-    from modules.frontend.text import text_to_sequence as _encode
-    return _encode(mixed, ["english_cleaners"])
-
-
-from modules.frontend.text import sequence_to_text
diff --git a/modules/frontend/es/__init__.py b/modules/frontend/es/__init__.py
deleted file mode 100644
index 24323e58a939ea5e4196755f23048a9b66d52fd2..0000000000000000000000000000000000000000
--- a/modules/frontend/es/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# coding: utf-8
-"""Spanish text frontend: symbol-level encoding using ``basic_cleaners``."""
-# The shared text package lives under ``modules.frontend`` in this
-# repository (the English frontend imports it the same way).
-from modules.frontend.text.symbols import symbols
-
-import nltk
-from random import random
-
-# Vocabulary size: one entry per symbol in the shared symbol table.
-n_vocab = len(symbols)
-
-
-def text_to_sequence(text, p=0.0):
-    """Convert Spanish ``text`` to a sequence of symbol IDs.
-
-    Args:
-        text: the input string.
-        p: accepted for interface parity with the other frontends; it is
-            not used here.
-
-    Returns:
-        The ID sequence produced by the shared text module with the
-        ``basic_cleaners`` pipeline.
-    """
-    from modules.frontend.text import text_to_sequence
-    text = text_to_sequence(text, ["basic_cleaners"])
-    return text
-
-
-from modules.frontend.text import sequence_to_text
diff --git a/modules/frontend/jp/__init__.py b/modules/frontend/jp/__init__.py
deleted file mode 100644
index 36c7fd84e1b4e5b5c4f4116978d8a35123b91cb6..0000000000000000000000000000000000000000
--- a/modules/frontend/jp/__init__.py
+++ /dev/null
@@ -1,77 +0,0 @@
-# coding: utf-8
-
-import MeCab
-import jaconv
-from random import random
-
-# Vocabulary size; text_to_sequence emits ord() values of characters.
-n_vocab = 0xffff
-
-# ID appended to every sequence by text_to_sequence.
-_eos = 1
-# Not referenced elsewhere in this module.
-_pad = 0
-# MeCab tagger, created lazily by mix_pronunciation.
-_tagger = None
-
-
-def _yomi(mecab_result):
-    """Split MeCab output into surface tokens and their readings.
-
-    Each line before the last is expected to be ``token<TAB>features``,
-    with comma-separated features. Parsing stops at the first line without
-    a tab. The reading is the 8th feature; it is ``None`` when that feature
-    is absent or equal to ``"*"``.
-
-    Returns:
-        (tokens, yomis): two lists of equal length.
-    """
-    tokens, yomis = [], []
-    lines = mecab_result.split("\n")[:-1]
-    for line in lines:
-        columns = line.split("\t")
-        if len(columns) == 1:
-            break
-        surface, feature_str = columns
-        features = feature_str.split(",")
-        reading = features[7] if len(features) > 7 else None
-        if reading == "*":
-            reading = None
-        tokens.append(surface)
-        yomis.append(reading)
-
-    return tokens, yomis
-
-
-def _mix_pronunciation(tokens, yomis, p):
-    """Join tokens, replacing each by its reading with probability ``p``.
-
-    A token without a reading (``None``) is always kept as is, and no
-    random number is drawn for it.
-    """
-    pieces = []
-    for idx, token in enumerate(tokens):
-        reading = yomis[idx]
-        if reading is not None and random() < p:
-            pieces.append(reading)
-        else:
-            pieces.append(token)
-    return "".join(pieces)
-
-
-def mix_pronunciation(text, p):
-    """Parse ``text`` with MeCab and randomly swap tokens for readings.
-
-    The module-level tagger is created on first use and reused afterwards.
-    """
-    global _tagger
-    if _tagger is None:
-        _tagger = MeCab.Tagger("")
-    parsed = _tagger.parse(text)
-    tokens, yomis = _yomi(parsed)
-    return _mix_pronunciation(tokens, yomis, p)
-
-
-def add_punctuation(text):
-    """Append "。" to ``text`` unless it already ends with punctuation.
-
-    Empty text is returned unchanged: there is no last character to
-    inspect, and indexing it would raise IndexError.
-    """
-    if not text:
-        return text
-    last = text[-1]
-    if last not in [".", ",", "、", "。", "！", "？", "!", "?"]:
-        text = text + "。"
-    return text
-
-
-def normalize_delimitor(text):
-    """Map ASCII and full-width commas/periods to "、" and "。"."""
-    replacements = (
-        (",", "、"),
-        (".", "。"),
-        ("，", "、"),
-        ("．", "。"),
-    )
-    for old, new in replacements:
-        text = text.replace(old, new)
-    return text
-
-
-def text_to_sequence(text, p=0.0):
-    """Convert Japanese ``text`` to a list of code points plus EOS.
-
-    Spaces and brackets are removed, "!"/"?" become full-width, delimiters
-    are normalized, and the text is passed through ``jaconv.normalize``.
-    When ``p > 0`` tokens are randomly replaced by their readings. The
-    result is converted with ``jaconv.hira2kata`` and given closing
-    punctuation.
-
-    Returns:
-        ``ord()`` of every character, followed by ``_eos``.
-    """
-    removable = (" ", "　", "「", "」", "『", "』", "・", "【", "】", "（", "）", "(",
-                 ")")
-    for ch in removable:
-        text = text.replace(ch, "")
-    text = text.replace("!", "！").replace("?", "？")
-
-    text = jaconv.normalize(normalize_delimitor(text))
-    if p > 0:
-        text = mix_pronunciation(text, p)
-    text = add_punctuation(jaconv.hira2kata(text))
-
-    ids = [ord(ch) for ch in text]
-    ids.append(_eos)  # EOS
-    return ids
-
-
-def sequence_to_text(seq):
-    """Turn a sequence of code points back into a string."""
-    return "".join(map(chr, seq))
diff --git a/modules/frontend/ko/__init__.py b/modules/frontend/ko/__init__.py
deleted file mode 100644
index ccb8b5f1cb78f4561f4e557e78712d2c1c82f016..0000000000000000000000000000000000000000
--- a/modules/frontend/ko/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# coding: utf-8
-
-from random import random
-
-# Vocabulary size; text_to_sequence emits ord() values of characters.
-n_vocab = 0xffff
-
-# ID appended to every sequence by text_to_sequence.
-_eos = 1
-# Not referenced elsewhere in this module.
-_pad = 0
-# Not referenced elsewhere in this module.
-_tagger = None
-
-
-def text_to_sequence(text, p=0.0):
-    """Encode ``text`` as the code points of its characters plus EOS.
-
-    ``p`` is accepted for interface parity with the other frontends and is
-    not used.
-    """
-    ids = list(map(ord, text))
-    ids.append(_eos)  # EOS
-    return ids
-
-
-def sequence_to_text(seq):
-    """Turn a sequence of code points back into a string."""
-    return "".join(map(chr, seq))
diff --git a/modules/frontend/text/__init__.py b/modules/frontend/text/__init__.py
deleted file mode 100644
index 26244ce3d44929613f5ad52e8e3a3e0dd85f07f1..0000000000000000000000000000000000000000
--- a/modules/frontend/text/__init__.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import re
-from . import cleaners
-from .symbols import symbols
-
-# Mappings from symbol to numeric ID and vice versa (an ID is the symbol's
-# index in ``symbols``):
-_symbol_to_id = {s: i for i, s in enumerate(symbols)}
-_id_to_symbol = {i: s for i, s in enumerate(symbols)}
-
-# Regular expression matching text enclosed in curly braces. Group 1 is the
-# text before the first "{...}", group 2 the content inside the braces and
-# group 3 everything after the closing brace:
-_curly_re = re.compile(r'(.*?)\{(.+?)\}(.*)')
-
-
-def text_to_sequence(text, cleaner_names):
-    '''Encode a string as the list of IDs of its symbols.
-
-      ARPAbet sequences may be embedded in the text inside curly braces, for
-      example "Turn left on {HH AW1 S S T AH0 N} Street." Text outside the
-      braces is cleaned and encoded symbol by symbol; text inside is encoded
-      as ARPAbet symbols.
-
-      Args:
-        text: string to convert to a sequence
-        cleaner_names: names of the cleaner functions to run the text through
-
-      Returns:
-        List of integers corresponding to the symbols in the text, ending
-        with the ID of the EOS symbol '~'
-    '''
-    ids = []
-    remaining = text
-
-    # Consume one "prefix{arpabet}" chunk per iteration:
-    while remaining:
-        match = _curly_re.match(remaining)
-        if match is None:
-            # No braces left: the rest is plain text.
-            ids.extend(
-                _symbols_to_sequence(_clean_text(remaining, cleaner_names)))
-            break
-        prefix, arpabet, remaining = match.groups()
-        ids.extend(_symbols_to_sequence(_clean_text(prefix, cleaner_names)))
-        ids.extend(_arpabet_to_sequence(arpabet))
-
-    # Append EOS token
-    ids.append(_symbol_to_id['~'])
-    return ids
-
-
-def sequence_to_text(sequence):
-    '''Decode a sequence of IDs back to a string.
-
-    Unknown IDs are skipped. ARPAbet symbols are wrapped in curly braces,
-    and adjacent ones are merged into one space-separated group.
-    '''
-    pieces = []
-    for symbol_id in sequence:
-        if symbol_id not in _id_to_symbol:
-            continue
-        symbol = _id_to_symbol[symbol_id]
-        # Enclose ARPAbet back in curly braces:
-        if len(symbol) > 1 and symbol[0] == '@':
-            symbol = '{%s}' % symbol[1:]
-        pieces.append(symbol)
-    return ''.join(pieces).replace('}{', ' ')
-
-
-def _clean_text(text, cleaner_names):
-    '''Run ``text`` through the named cleaners, in order.
-
-    Args:
-      text: string to clean
-      cleaner_names: names of functions defined in the ``cleaners`` module
-
-    Returns:
-      The cleaned string.
-
-    Raises:
-      Exception: if a name does not refer to a cleaner.
-    '''
-    for name in cleaner_names:
-        # A None default makes the check below reachable; a plain getattr
-        # would raise AttributeError first.
-        cleaner = getattr(cleaners, name, None)
-        if not cleaner:
-            raise Exception('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    return text
-
-
-def _symbols_to_sequence(symbols):
-    '''Map symbols to their IDs, skipping the ones that are not kept.'''
-    ids = []
-    for symbol in symbols:
-        if _should_keep_symbol(symbol):
-            ids.append(_symbol_to_id[symbol])
-    return ids
-
-
-def _arpabet_to_sequence(text):
-    '''Encode whitespace-separated ARPAbet symbols, each prefixed by "@".'''
-    prefixed = ['@' + phoneme for phoneme in text.split()]
-    return _symbols_to_sequence(prefixed)
-
-
-def _should_keep_symbol(s):
-    '''Return True for known symbols other than '_' and '~'.'''
-    # Compare by value: ``is not`` against a string literal tests object
-    # identity, which is not guaranteed for equal strings.
-    return s in _symbol_to_id and s != '_' and s != '~'
diff --git a/modules/frontend/text/cleaners.py b/modules/frontend/text/cleaners.py
deleted file mode 100644
index e94226476968df18c5e28f4b705f9161f3555025..0000000000000000000000000000000000000000
--- a/modules/frontend/text/cleaners.py
+++ /dev/null
@@ -1,104 +0,0 @@
-'''
-Cleaners are transformations that run over the input text at both training and
-eval time.
-
-Cleaners can be selected by passing a comma-delimited list of cleaner names as
-the "cleaners" hyperparameter. Some cleaners are English-specific. You'll 
-typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated 
-  to ASCII using the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you 
-  should also update the symbols in symbols.py to match your data).
-'''
-
-import re
-from unidecode import unidecode
-from .numbers import normalize_numbers
-
-# Regular expression matching one or more whitespace characters:
-_whitespace_re = re.compile(r'\s+')
-
-# List of (regular expression, replacement) pairs for abbreviations. Each
-# pattern matches the abbreviation at a word boundary followed by a period,
-# ignoring case:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1])
-                  for x in [
-                      ('mrs', 'misess'),
-                      ('mr', 'mister'),
-                      ('dr', 'doctor'),
-                      ('st', 'saint'),
-                      ('co', 'company'),
-                      ('jr', 'junior'),
-                      ('maj', 'major'),
-                      ('gen', 'general'),
-                      ('drs', 'doctors'),
-                      ('rev', 'reverend'),
-                      ('lt', 'lieutenant'),
-                      ('hon', 'honorable'),
-                      ('sgt', 'sergeant'),
-                      ('capt', 'captain'),
-                      ('esq', 'esquire'),
-                      ('ltd', 'limited'),
-                      ('col', 'colonel'),
-                      ('ft', 'fort'),
-                  ]]
-
-
-def expand_abbreviations(text):
-    '''Replace every known abbreviation in ``text`` by its expansion.'''
-    for pattern, expansion in _abbreviations:
-        text = pattern.sub(expansion, text)
-    return text
-
-
-def expand_numbers(text):
-    '''Return ``text`` as rewritten by ``normalize_numbers``.'''
-    expanded = normalize_numbers(text)
-    return expanded
-
-
-def lowercase(text):
-    '''Return a lowercased copy of ``text``.'''
-    lowered = text.lower()
-    return lowered
-
-
-def collapse_whitespace(text):
-    '''Replace every run of whitespace in ``text`` by a single space.'''
-    return _whitespace_re.sub(' ', text)
-
-
-def convert_to_ascii(text):
-    '''Return ``text`` as converted by ``unidecode``.'''
-    ascii_text = unidecode(text)
-    return ascii_text
-
-
-def add_punctuation(text):
-    '''Append "." to non-empty text that does not end with punctuation.
-
-    Empty text is returned unchanged.
-    '''
-    # without the final punctuation the decoder is confused when to output
-    # EOS
-    if text and text[-1] not in '!,.:;?':
-        return text + '.'
-    return text
-
-
-def basic_cleaners(text):
-    '''
-    Minimal pipeline without transliteration: lowercase the text, then
-    collapse whitespace.
-    '''
-    return collapse_whitespace(lowercase(text))
-
-
-def transliteration_cleaners(text):
-    '''
-    Pipeline for non-English text: transliterate to ASCII, lowercase, then
-    collapse whitespace.
-    '''
-    for step in (convert_to_ascii, lowercase, collapse_whitespace):
-        text = step(text)
-    return text
-
-
-def english_cleaners(text):
-    '''
-    Pipeline for English text. The steps run in this order: ASCII
-    transliteration, final punctuation, lowercasing, number expansion,
-    abbreviation expansion and whitespace collapsing.
-    '''
-    pipeline = (
-        convert_to_ascii,
-        add_punctuation,
-        lowercase,
-        expand_numbers,
-        expand_abbreviations,
-        collapse_whitespace,
-    )
-    for step in pipeline:
-        text = step(text)
-    return text
diff --git a/modules/frontend/text/cmudict.py b/modules/frontend/text/cmudict.py
deleted file mode 100644
index 304592b66e246d6c578ad024e63548a00aa6a9b5..0000000000000000000000000000000000000000
--- a/modules/frontend/text/cmudict.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import re
-
-# Symbols accepted in a pronunciation; _get_pronunciation rejects any
-# pronunciation containing a symbol that is not listed here.
-valid_symbols = [
-    'AA', 'AA0', 'AA1', 'AA2', 'AE', 'AE0', 'AE1', 'AE2', 'AH', 'AH0', 'AH1',
-    'AH2', 'AO', 'AO0', 'AO1', 'AO2', 'AW', 'AW0', 'AW1', 'AW2', 'AY', 'AY0',
-    'AY1', 'AY2', 'B', 'CH', 'D', 'DH', 'EH', 'EH0', 'EH1', 'EH2', 'ER', 'ER0',
-    'ER1', 'ER2', 'EY', 'EY0', 'EY1', 'EY2', 'F', 'G', 'HH', 'IH', 'IH0', 'IH1',
-    'IH2', 'IY', 'IY0', 'IY1', 'IY2', 'JH', 'K', 'L', 'M', 'N', 'NG', 'OW',
-    'OW0', 'OW1', 'OW2', 'OY', 'OY0', 'OY1', 'OY2', 'P', 'R', 'S', 'SH', 'T',
-    'TH', 'UH', 'UH0', 'UH1', 'UH2', 'UW', 'UW0', 'UW1', 'UW2', 'V', 'W', 'Y',
-    'Z', 'ZH'
-]
-
-# Set form of valid_symbols, used for membership tests.
-_valid_symbol_set = set(valid_symbols)
-
-
-class CMUDict:
-    '''
-    Thin wrapper around CMUDict data.
-    http://www.speech.cs.cmu.edu/cgi-bin/cmudict
-
-    Args:
-      file_or_path: a path string (opened as latin-1) or an iterable of
-        lines
-      keep_ambiguous: when False, only words with exactly one pronunciation
-        are kept
-    '''
-
-    def __init__(self, file_or_path, keep_ambiguous=True):
-        if not isinstance(file_or_path, str):
-            parsed = _parse_cmudict(file_or_path)
-        else:
-            with open(file_or_path, encoding='latin-1') as handle:
-                parsed = _parse_cmudict(handle)
-        if not keep_ambiguous:
-            parsed = dict((word, prons) for word, prons in parsed.items()
-                          if len(prons) == 1)
-        self._entries = parsed
-
-    def __len__(self):
-        '''Number of words in the dictionary.'''
-        return len(self._entries)
-
-    def lookup(self, word):
-        '''Returns list of ARPAbet pronunciations of the given word, or
-        None when the word is unknown. The word is upper-cased first.'''
-        key = word.upper()
-        return self._entries.get(key)
-
-
-# Matches a parenthesized number such as "(1)"; it is stripped from words.
-_alt_re = re.compile(r'\([0-9]+\)')
-
-
-def _parse_cmudict(file):
-    '''Build a word -> list of pronunciations mapping from ``file``.
-
-    Only lines starting with an uppercase ASCII letter or an apostrophe are
-    read. The word and the pronunciation are separated by two spaces.
-    Pronunciations rejected by ``_get_pronunciation`` are skipped.
-    '''
-    entries = {}
-    for line in file:
-        if not len(line):
-            continue
-        first = line[0]
-        if not ('A' <= first <= 'Z' or first == "'"):
-            continue
-        fields = line.split('  ')
-        word = _alt_re.sub('', fields[0])
-        pronunciation = _get_pronunciation(fields[1])
-        if pronunciation:
-            entries.setdefault(word, []).append(pronunciation)
-    return entries
-
-
-def _get_pronunciation(s):
-    '''Return the stripped pronunciation, or None if any symbol is invalid.
-
-    Symbols are separated by single spaces and must all belong to
-    ``_valid_symbol_set``.
-    '''
-    phonemes = s.strip().split(' ')
-    if any(phoneme not in _valid_symbol_set for phoneme in phonemes):
-        return None
-    return ' '.join(phonemes)
diff --git a/modules/frontend/text/numbers.py b/modules/frontend/text/numbers.py
deleted file mode 100644
index 24b58175dc1028994a0bcccfde6531b21a27cb72..0000000000000000000000000000000000000000
--- a/modules/frontend/text/numbers.py
+++ /dev/null
@@ -1,71 +0,0 @@
-# -*- coding: utf-8 -*-
-
-import inflect
-import re
-
-# Engine used to spell out numbers as words.
-_inflect = inflect.engine()
-# Digits with embedded commas, e.g. "1,000".
-_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
-# Digits on both sides of a decimal point, e.g. "3.14".
-_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
-# An amount preceded by "£"; group 1 is the amount.
-_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
-# An amount preceded by "$"; group 1 is the amount.
-_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
-# Digits followed by an ordinal suffix, e.g. "2nd".
-_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
-# Any run of digits.
-_number_re = re.compile(r'[0-9]+')
-
-
-def _remove_commas(m):
-    """Return group 1 of the match ``m`` with all commas removed."""
-    number = m.group(1)
-    return number.replace(',', '')
-
-
-def _expand_decimal_point(m):
-    """Return group 1 of the match ``m`` with "." spelled as " point "."""
-    number = m.group(1)
-    return number.replace('.', ' point ')
-
-
-def _expand_dollars(m):
-    """Spell out the dollar amount captured in group 1 of the match ``m``.
-
-    The amount is split at "." into dollars and cents. Zero parts are
-    omitted, units are singular for a value of 1, and an amount of zero
-    gives "zero dollars". An amount with more than one "." is returned as
-    is, followed by " dollars".
-    """
-    amount = m.group(1)
-    pieces = amount.split('.')
-    if len(pieces) > 2:
-        return amount + ' dollars'  # Unexpected format
-
-    dollars = int(pieces[0]) if pieces[0] else 0
-    cents = int(pieces[1]) if len(pieces) > 1 and pieces[1] else 0
-    if not dollars and not cents:
-        return 'zero dollars'
-
-    dollar_text = '%s %s' % (dollars, 'dollar' if dollars == 1 else 'dollars')
-    cent_text = '%s %s' % (cents, 'cent' if cents == 1 else 'cents')
-    if dollars and cents:
-        return '%s, %s' % (dollar_text, cent_text)
-    return dollar_text if dollars else cent_text
-
-
-def _expand_ordinal(m):
-    """Return the words ``_inflect`` gives for the whole matched text."""
-    ordinal = m.group(0)
-    return _inflect.number_to_words(ordinal)
-
-
-def _expand_number(m):
-    """Spell out the integer matched by ``m``.
-
-    Numbers strictly between 1000 and 3000 get special handling: 2000 and
-    2001-2009 are read as "two thousand ...", multiples of 100 as
-    "... hundred", and the rest in two-digit groups. Any other number is
-    spelled out in full without "and".
-    """
-    num = int(m.group(0))
-    if not (1000 < num < 3000):
-        return _inflect.number_to_words(num, andword='')
-
-    if num == 2000:
-        return 'two thousand'
-    if 2000 < num < 2010:
-        return 'two thousand ' + _inflect.number_to_words(num % 100)
-    if num % 100 == 0:
-        return _inflect.number_to_words(num // 100) + ' hundred'
-    grouped = _inflect.number_to_words(
-        num, andword='', zero='oh', group=2)
-    return grouped.replace(', ', ' ')
-
-
-def normalize_numbers(text):
-    """Rewrite the numeric expressions in ``text`` as words.
-
-    The substitutions run in a fixed order: commas inside numbers, pounds,
-    dollars, decimal points, ordinals and finally plain numbers.
-    """
-    substitutions = (
-        (_comma_number_re, _remove_commas),
-        (_pounds_re, r'\1 pounds'),
-        (_dollars_re, _expand_dollars),
-        (_decimal_number_re, _expand_decimal_point),
-        (_ordinal_re, _expand_ordinal),
-        (_number_re, _expand_number),
-    )
-    for pattern, replacement in substitutions:
-        text = pattern.sub(replacement, text)
-    return text
diff --git a/modules/frontend/text/symbols.py b/modules/frontend/text/symbols.py
deleted file mode 100644
index c6fc28bcf2f894147492559fce79d5723e6e110b..0000000000000000000000000000000000000000
--- a/modules/frontend/text/symbols.py
+++ /dev/null
@@ -1,18 +0,0 @@
-'''
-Defines the set of symbols used in text input to the model.
-
-The default is a set of ASCII characters that works well for English or text 
-that has been run through Unidecode. For other data, you can modify _characters. 
-See TRAINING_DATA.md for details.
-'''
-from .cmudict import valid_symbols
-
-# Padding symbol; it comes first in ``symbols``.
-_pad = '_'
-# End-of-sequence symbol; it comes second in ``symbols``.
-_eos = '~'
-# Characters that may appear in the input text.
-_characters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz!\'(),-.:;? '
-
-# Prepend "@" to ARPAbet symbols to ensure uniqueness (some are the same as uppercase letters):
-_arpabet = ['@' + s for s in valid_symbols]
-
-# Export all symbols, in order: padding, EOS, characters, ARPAbet symbols.
-symbols = [_pad, _eos] + list(_characters) + _arpabet
diff --git a/modules/loss.py b/modules/loss.py
deleted file mode 100644
index 96bcd3ba96f84ff0c93b06f4290b50243cadc2f2..0000000000000000000000000000000000000000
--- a/modules/loss.py
+++ /dev/null
@@ -1,158 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-from numba import jit
-
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-
-def masked_mean(inputs, mask):
-    """
-    Args:
-        inputs (Variable): Shape(B, C, 1, T), the input, where B means
-            batch size, C means channels of input, T means timesteps of
-            the input.
-        mask (Variable): Shape(B, T), a mask. 
-    Returns:
-        loss (Variable): Shape(1, ), masked mean.
-    """
-    channels = inputs.shape[1]
-    reshaped_mask = fluid.layers.reshape(
-        mask, shape=[mask.shape[0], 1, 1, mask.shape[-1]])
-    expanded_mask = fluid.layers.expand(
-        reshaped_mask, expand_times=[1, channels, 1, 1])
-    expanded_mask.stop_gradient = True
-
-    valid_cnt = fluid.layers.reduce_sum(expanded_mask)
-    valid_cnt.stop_gradient = True
-
-    masked_inputs = inputs * expanded_mask
-    loss = fluid.layers.reduce_sum(masked_inputs) / valid_cnt
-    return loss
-
-
-@jit(nopython=True)
-def guided_attention(N, max_N, T, max_T, g):
-    W = np.zeros((max_N, max_T), dtype=np.float32)
-    for n in range(N):
-        for t in range(T):
-            W[n, t] = 1 - np.exp(-(n / N - t / T)**2 / (2 * g * g))
-    return W
-
-
-def guided_attentions(input_lengths, target_lengths, max_target_len, g=0.2):
-    B = len(input_lengths)
-    max_input_len = input_lengths.max()
-    W = np.zeros((B, max_target_len, max_input_len), dtype=np.float32)
-    for b in range(B):
-        W[b] = guided_attention(input_lengths[b], max_input_len,
-                                target_lengths[b], max_target_len, g).T
-    return W
-
-
-class TTSLoss(object):
-    def __init__(self,
-                 masked_weight=0.0,
-                 priority_weight=0.0,
-                 binary_divergence_weight=0.0,
-                 guided_attention_sigma=0.2):
-        self.masked_weight = masked_weight
-        self.priority_weight = priority_weight
-        self.binary_divergence_weight = binary_divergence_weight
-        self.guided_attention_sigma = guided_attention_sigma
-
-    def l1_loss(self, prediction, target, mask, priority_bin=None):
-        abs_diff = fluid.layers.abs(prediction - target)
-
-        # basic mask-weighted l1 loss
-        w = self.masked_weight
-        if w > 0 and mask is not None:
-            base_l1_loss = w * masked_mean(abs_diff, mask) + (
-                1 - w) * fluid.layers.reduce_mean(abs_diff)
-        else:
-            base_l1_loss = fluid.layers.reduce_mean(abs_diff)
-
-        if self.priority_weight > 0 and priority_bin is not None:
-            # mask-weighted priority channels' l1-loss
-            priority_abs_diff = fluid.layers.slice(
-                abs_diff, axes=[1], starts=[0], ends=[priority_bin])
-            if w > 0 and mask is not None:
-                priority_loss = w * masked_mean(priority_abs_diff, mask) + (
-                    1 - w) * fluid.layers.reduce_mean(priority_abs_diff)
-            else:
-                priority_loss = fluid.layers.reduce_mean(priority_abs_diff)
-
-            # priority weighted sum
-            p = self.priority_weight
-            loss = p * priority_loss + (1 - p) * base_l1_loss
-        else:
-            loss = base_l1_loss
-        return loss
-
-    def binary_divergence(self, prediction, target, mask):
-        flattened_prediction = fluid.layers.reshape(prediction, [-1, 1])
-        flattened_target = fluid.layers.reshape(target, [-1, 1])
-        flattened_loss = fluid.layers.log_loss(
-            flattened_prediction, flattened_target, epsilon=1e-8)
-        bin_div = fluid.layers.reshape(flattened_loss, prediction.shape)
-
-        w = self.masked_weight
-        if w > 0 and mask is not None:
-            loss = w * masked_mean(bin_div, mask) + (
-                1 - w) * fluid.layers.reduce_mean(bin_div)
-        else:
-            loss = fluid.layers.reduce_mean(bin_div)
-        return loss
-
-    @staticmethod
-    def done_loss(done_hat, done):
-        flat_done_hat = fluid.layers.reshape(done_hat, [-1, 1])
-        flat_done = fluid.layers.reshape(done, [-1, 1])
-        loss = fluid.layers.log_loss(flat_done_hat, flat_done, epsilon=1e-8)
-        loss = fluid.layers.reduce_mean(loss)
-        return loss
-
-    def attention_loss(self, predicted_attention, input_lengths,
-                       target_lengths):
-        """
-        Given valid encoder_lengths and decoder_lengths, compute a diagonal 
-        guide, and compute loss from the predicted attention and the guide.
-        
-        Args:
-            predicted_attention (Variable): Shape(*, B, T_dec, T_enc), the 
-                alignment tensor, where B means batch size, T_dec means number
-                of time steps of the decoder, T_enc means the number of time
-                steps of the encoder, * means other possible dimensions.
-            input_lengths (numpy.ndarray): Shape(B,), dtype:int64, valid lengths
-                (time steps) of encoder outputs.
-            target_lengths (numpy.ndarray): Shape(batch_size,), dtype:int64, 
-                valid lengths (time steps) of decoder outputs.
-        
-        Returns:
-            loss (Variable): Shape(1, ) attention loss.
-        """
-        n_attention, batch_size, max_target_len, max_input_len = (
-            predicted_attention.shape)
-        soft_mask = guided_attentions(input_lengths, target_lengths,
-                                      max_target_len,
-                                      self.guided_attention_sigma)
-        soft_mask_ = dg.to_variable(soft_mask)
-        loss = fluid.layers.reduce_mean(predicted_attention * soft_mask_)
-        return loss
diff --git a/modules/modules.py b/modules/modules.py
deleted file mode 100644
index 3ae95d78a09e58f158004862c3e44a16576b7c22..0000000000000000000000000000000000000000
--- a/modules/modules.py
+++ /dev/null
@@ -1,458 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-
-import numpy as np
-
-import conv
-import weight_norm as weight_norm
-
-
-def FC(name_scope,
-       in_features,
-       size,
-       num_flatten_dims=1,
-       dropout=0.0,
-       epsilon=1e-30,
-       act=None,
-       is_test=False,
-       dtype="float32"):
-    """
-    A special Linear Layer, when it is used with dropout, the weight is 
-    initialized as normal(0, std=np.sqrt((1-dropout) / in_features))
-    """
-
-    # stds
-    if isinstance(in_features, int):
-        in_features = [in_features]
-    stds = [np.sqrt((1 - dropout) / in_feature) for in_feature in in_features]
-    weight_inits = [
-        fluid.initializer.NormalInitializer(scale=std) for std in stds
-    ]
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-
-    # param attrs
-    weight_attrs = [fluid.ParamAttr(initializer=init) for init in weight_inits]
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-
-    layer = weight_norm.FC(name_scope,
-                           size,
-                           num_flatten_dims=num_flatten_dims,
-                           param_attr=weight_attrs,
-                           bias_attr=bias_attr,
-                           act=act,
-                           dtype=dtype)
-    return layer
-
-
-def Conv1D(name_scope,
-           in_channels,
-           num_filters,
-           filter_size=3,
-           dilation=1,
-           groups=None,
-           causal=False,
-           std_mul=1.0,
-           dropout=0.0,
-           use_cudnn=True,
-           act=None,
-           dtype="float32"):
-    """
-    A special Conv1D Layer, when it is used with dropout, the weight is 
-    initialized as 
-    normal(0, std=np.sqrt(std_mul * (1-dropout) / (filter_size * in_features)))
-    """
-    # std
-    std = np.sqrt((std_mul * (1 - dropout)) / (filter_size * in_channels))
-    weight_init = fluid.initializer.NormalInitializer(loc=0.0, scale=std)
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-
-    # param attrs
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-
-    layer = conv.Conv1D(
-        name_scope,
-        in_channels,
-        num_filters,
-        filter_size,
-        dilation,
-        groups=groups,
-        causal=causal,
-        param_attr=weight_attr,
-        bias_attr=bias_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-    return layer
-
-
-def Embedding(name_scope,
-              num_embeddings,
-              embed_dim,
-              is_sparse=False,
-              is_distributed=False,
-              padding_idx=None,
-              std=0.01,
-              dtype="float32"):
-    # param attrs
-    weight_attr = fluid.ParamAttr(initializer=fluid.initializer.Normal(
-        scale=std))
-    layer = dg.Embedding(
-        name_scope, (num_embeddings, embed_dim),
-        padding_idx=padding_idx,
-        param_attr=weight_attr,
-        dtype=dtype)
-    return layer
-
-
-class Conv1DGLU(dg.Layer):
-    """
-    A Convolution 1D block with GLU activation. It also applys dropout for the 
-    input x. It fuses speaker embeddings through a FC activated by softsign. It
-    has residual connection from the input x, and scale the output by 
-    np.sqrt(0.5).
-    """
-
-    def __init__(self,
-                 name_scope,
-                 n_speakers,
-                 speaker_dim,
-                 in_channels,
-                 num_filters,
-                 filter_size,
-                 dilation,
-                 std_mul=4.0,
-                 dropout=0.0,
-                 causal=False,
-                 residual=True,
-                 dtype="float32"):
-        super(Conv1DGLU, self).__init__(name_scope, dtype=dtype)
-
-        # conv spec
-        self.in_channels = in_channels
-        self.n_speakers = n_speakers
-        self.speaker_dim = speaker_dim
-        self.num_filters = num_filters
-        self.filter_size = filter_size
-        self.dilation = dilation
-        self.causal = causal
-        self.residual = residual
-
-        # weight init and dropout
-        self.std_mul = std_mul
-        self.dropout = dropout
-
-        if residual:
-            assert (
-                in_channels == num_filters
-            ), "this block uses residual connection"\
-                "the input_channes should equals num_filters"
-
-        self.conv = Conv1D(
-            self.full_name(),
-            in_channels,
-            2 * num_filters,
-            filter_size,
-            dilation,
-            causal=causal,
-            std_mul=std_mul,
-            dropout=dropout,
-            dtype=dtype)
-
-        if n_speakers > 1:
-            assert (speaker_dim is not None
-                    ), "speaker embed should not be null in multi-speaker case"
-            self.fc = Conv1D(
-                self.full_name(),
-                speaker_dim,
-                num_filters,
-                filter_size=1,
-                dilation=1,
-                causal=False,
-                act="softsign",
-                dtype=dtype)
-
-    def forward(self, x, speaker_embed_bc1t=None):
-        """
-        Args:
-            x (Variable): Shape(B, C_in, 1, T), the input of Conv1DGLU
-                layer, where B means batch_size, C_in means the input channels
-                T means input time steps.
-            speaker_embed_bct1 (Variable): Shape(B, C_sp, 1, T), expanded
-                speaker embed, where C_sp means speaker embedding size. Note
-                that when using residual connection, the Conv1DGLU does not
-                change the number of channels, so out channels equals input
-                channels.
-
-        Returns:
-            x (Variable): Shape(B, C_out, 1, T), the output of Conv1DGLU, where
-                C_out means the output channels of Conv1DGLU.
-        """
-
-        residual = x
-        x = fluid.layers.dropout(
-            x, self.dropout, dropout_implementation="upscale_in_train")
-        x = self.conv(x)
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        if speaker_embed_bc1t is not None:
-            sp = self.fc(speaker_embed_bc1t)
-            content = content + sp
-
-        # glu
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
-
-        if self.residual:
-            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
-        return x
-
-    def add_input(self, x, speaker_embed_bc11=None):
-        """
-        Inputs:
-        x: shape(B, num_filters, 1, time_steps)
-        speaker_embed_bc11: shape(B, speaker_dim, 1, time_steps)
-
-        Outputs:
-        out: shape(B, num_filters, 1, time_steps), where time_steps = 1
-        """
-
-        residual = x
-
-        # add step input and produce step output
-        x = fluid.layers.dropout(
-            x, self.dropout, dropout_implementation="upscale_in_train")
-        x = self.conv.add_input(x)
-
-        content, gate = fluid.layers.split(x, num_or_sections=2, dim=1)
-
-        if speaker_embed_bc11 is not None:
-            sp = self.fc(speaker_embed_bc11)
-            content = content + sp
-
-        x = fluid.layers.elementwise_mul(fluid.layers.sigmoid(gate), content)
-
-        if self.residual:
-            x = fluid.layers.scale(x + residual, np.sqrt(0.5))
-        return x
-
-
-def Conv1DTranspose(name_scope,
-                    in_channels,
-                    num_filters,
-                    filter_size,
-                    padding=0,
-                    stride=1,
-                    dilation=1,
-                    groups=None,
-                    std_mul=1.0,
-                    dropout=0.0,
-                    use_cudnn=True,
-                    act=None,
-                    dtype="float32"):
-    std = np.sqrt(std_mul * (1 - dropout) / (in_channels * filter_size))
-    weight_init = fluid.initializer.NormalInitializer(scale=std)
-    weight_attr = fluid.ParamAttr(initializer=weight_init)
-    bias_init = fluid.initializer.ConstantInitializer(0.0)
-    bias_attr = fluid.ParamAttr(initializer=bias_init)
-    layer = conv.Conv1DTranspose(
-        name_scope,
-        in_channels,
-        num_filters,
-        filter_size,
-        padding=padding,
-        stride=stride,
-        dilation=dilation,
-        groups=groups,
-        param_attr=weight_attr,
-        bias_attr=bias_attr,
-        use_cudnn=use_cudnn,
-        act=act,
-        dtype=dtype)
-    return layer
-
-
-def compute_position_embedding(rad):
-    # rad is a transposed radius, shape(embed_dim, n_vocab)
-    embed_dim, n_vocab = rad.shape
-
-    even_dims = dg.to_variable(np.arange(0, embed_dim, 2).astype("int32"))
-    odd_dims = dg.to_variable(np.arange(1, embed_dim, 2).astype("int32"))
-
-    even_rads = fluid.layers.gather(rad, even_dims)
-    odd_rads = fluid.layers.gather(rad, odd_dims)
-
-    sines = fluid.layers.sin(even_rads)
-    cosines = fluid.layers.cos(odd_rads)
-
-    temp = fluid.layers.scatter(rad, even_dims, sines)
-    out = fluid.layers.scatter(temp, odd_dims, cosines)
-    out = fluid.layers.transpose(out, perm=[1, 0])
-    return out
-
-
-def position_encoding_init(n_position,
-                           d_pos_vec,
-                           position_rate=1.0,
-                           sinusoidal=True):
-    """ Init the sinusoid position encoding table """
-
-    # keep idx 0 for padding token position encoding zero vector
-    position_enc = np.array([[
-        position_rate * pos / np.power(10000, 2 * (i // 2) / d_pos_vec)
-        for i in range(d_pos_vec)
-    ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)])
-
-    if sinusoidal:
-        position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2])  # dim 2i
-        position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2])  # dim 2i+1
-
-    return position_enc
-
-
-class PositionEmbedding(dg.Layer):
-    def __init__(self,
-                 name_scope,
-                 n_position,
-                 d_pos_vec,
-                 position_rate=1.0,
-                 is_sparse=False,
-                 is_distributed=False,
-                 param_attr=None,
-                 max_norm=None,
-                 padding_idx=None,
-                 dtype="float32"):
-        super(PositionEmbedding, self).__init__(name_scope, dtype=dtype)
-        self.embed = dg.Embedding(
-            self.full_name(),
-            size=(n_position, d_pos_vec),
-            is_sparse=is_sparse,
-            is_distributed=is_distributed,
-            padding_idx=None,
-            param_attr=param_attr,
-            dtype=dtype)
-        self.set_weight(
-            position_encoding_init(
-                n_position,
-                d_pos_vec,
-                position_rate=position_rate,
-                sinusoidal=False).astype(dtype))
-
-        self._is_sparse = is_sparse
-        self._is_distributed = is_distributed
-        self._remote_prefetch = self._is_sparse and (not self._is_distributed)
-        if self._remote_prefetch:
-            assert self._is_sparse is True and self._is_distributed is False
-
-        self._padding_idx = (-1 if padding_idx is None else padding_idx if
-                             padding_idx >= 0 else (n_position + padding_idx))
-        self._position_rate = position_rate
-        self._max_norm = max_norm
-        self._dtype = dtype
-
-    def set_weight(self, array):
-        assert self.embed._w.shape == list(array.shape), "shape does not match"
-        self.embed._w._ivar.value().get_tensor().set(
-            array, fluid.framework._current_expected_place())
-
-    def forward(self, indices, speaker_position_rate=None):
-        """
-        Args:
-            indices (Variable): Shape (B, T, 1), dtype: int64, position
-                indices, where B means the batch size, T means the time steps.
-            speaker_position_rate (Variable | float, optional), position
-                rate. It can be a float point number or a Variable with 
-                shape (1,), then this speaker_position_rate is used for every 
-                example. It can also be a Variable with shape (B, 1), which 
-                contains a speaker position rate for each speaker.
-        Returns:
-            out (Variable): Shape(B, C_pos), position embedding, where C_pos 
-                means position embedding size.
-        """
-        rad = fluid.layers.transpose(self.embed._w, perm=[1, 0])
-        batch_size = indices.shape[0]
-
-        if speaker_position_rate is None:
-            weight = compute_position_embedding(rad)
-            out = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="lookup_table",
-                inputs={"Ids": indices,
-                        "W": weight},
-                outputs={"Out": out},
-                attrs={
-                    "is_sparse": self._is_sparse,
-                    "is_distributed": self._is_distributed,
-                    "remote_prefetch": self._remote_prefetch,
-                    "padding_idx":
-                    self._padding_idx,  # special value for lookup table op
-                })
-            return out
-
-        elif (np.isscalar(speaker_position_rate) or
-              isinstance(speaker_position_rate, fluid.framework.Variable) and
-              speaker_position_rate.shape == [1, 1]):
-            # # make a weight
-            # scale the weight (the operand for sin & cos)
-            if np.isscalar(speaker_position_rate):
-                scaled_rad = fluid.layers.scale(rad, speaker_position_rate)
-            else:
-                scaled_rad = fluid.layers.elementwise_mul(
-                    rad, speaker_position_rate[0])
-            weight = compute_position_embedding(scaled_rad)
-            out = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="lookup_table",
-                inputs={"Ids": indices,
-                        "W": weight},
-                outputs={"Out": out},
-                attrs={
-                    "is_sparse": self._is_sparse,
-                    "is_distributed": self._is_distributed,
-                    "remote_prefetch": self._remote_prefetch,
-                    "padding_idx":
-                    self._padding_idx,  # special value for lookup table op
-                })
-            return out
-
-        elif np.prod(speaker_position_rate.shape) > 1:
-            assert speaker_position_rate.shape == [batch_size, 1]
-            outputs = []
-            for i in range(batch_size):
-                rate = speaker_position_rate[i]  # rate has shape [1]
-                scaled_rad = fluid.layers.elementwise_mul(rad, rate)
-                weight = compute_position_embedding(scaled_rad)
-                out = self._helper.create_variable_for_type_inference(
-                    self._dtype)
-                sequence = indices[i]
-                self._helper.append_op(
-                    type="lookup_table",
-                    inputs={"Ids": sequence,
-                            "W": weight},
-                    outputs={"Out": out},
-                    attrs={
-                        "is_sparse": self._is_sparse,
-                        "is_distributed": self._is_distributed,
-                        "remote_prefetch": self._remote_prefetch,
-                        "padding_idx": -1,
-                    })
-                outputs.append(out)
-            out = fluid.layers.stack(outputs)
-            return out
-        else:
-            raise Exception("Then you can just use position rate at init")
diff --git a/modules/weight_norm.py b/modules/weight_norm.py
deleted file mode 100644
index cbb0d03f9c50d697421d760d3c57ac95cfd63048..0000000000000000000000000000000000000000
--- a/modules/weight_norm.py
+++ /dev/null
@@ -1,863 +0,0 @@
-#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-import numpy as np
-from six.moves import reduce
-
-from copy import deepcopy
-
-import paddle
-from paddle import fluid
-import paddle.fluid.dygraph as dg
-from paddle.fluid import core
-from paddle.fluid.layers import utils
-from paddle.fluid.framework import Variable
-from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer
-
-
-def _norm(p, dim):
-    """Computes the norm over all dimensions except dim.
-    It differs from pytorch implementation that it does not keep dim.
-    This difference is related with the broadcast mechanism in paddle.
-    Read elementeise_mul for more.
-    """
-
-    if dim is None:
-        return np.linalg.norm(p, ord=2, axis=None)
-    elif dim == 0:
-        p = np.reshape(p, newshape=(p.shape[0], -1))
-        return np.linalg.norm(p, ord=2, axis=1)
-    elif dim == p.ndim - 1:
-        p = np.reshape(p, newshape=(-1, p.shape[-1]))
-        return np.linalg.norm(p, ord=2, axis=0)
-    else:
-        perm = list(range(p.ndim))
-        perm[0] = dim
-        perm[dim] = 0
-        return _norm(np.transpose(p, axes=perm))
-
-
-class FC(dg.Layer):
-    """
-    **Fully Connected Layer**
-
-    This function creates a fully connected layer in the network. It can take
-    one or multiple tensors as its inputs(input can be a list of Variable, see
-    Args in detail). It creates a pair of variables called (magnitude(g), 
-    direction(V)) for each input tensor. Elementwise_mul(V, g) represents a fully connected 
-    weight matrix from each input unit to each output unit. 
-    The fully connected layer multiplies each input tensor
-    with its corresponding weight to produce an output Tensor with shape [M, `size`],
-    where M is batch size. If multiple input tensors are given, the results of
-    multiple output tensors with shape [M, `size`] will be summed up. If bias_attr
-    is not None, a bias variable will be created and added to the output.
-    Finally, if activation is not None, it will be applied to the output as well.
-
-    When the input is single tensor:
-
-    .. math::
-
-        Out = Act({X(normalize(V)g) + b})
-
-    When the input are multiple tensors:
-
-    .. math::
-
-        Out = Act({\sum_{i=0}^{N-1}X_i(V_ig_i) + b})
-
-    In the above equation:
-
-    * :math:`N`: Number of the input. N equals to len(input) if input is list of Variable.
-    * :math:`X_i`: The i-th input tensor.
-    * :math:`V_i`: The i-th direction matrix corresponding i-th input tensor.
-    * :math:`g_i`: The i-th magnitude vector corresponding i-th input tensor.
-    * :math:`b`: The bias parameter created by this layer (if needed).
-    * :math:`Act`: The activation function.
-    * :math:`Out`: The output tensor.
-
-    See below for an example.
-
-    .. code-block:: text
-
-        Given:
-            data_1.data = [[[0.1, 0.2],
-                           [0.3, 0.4]]]
-            data_1.shape = (1, 2, 2) # 1 is batch_size
-
-            data_2 = [[[0.1, 0.2, 0.3]]]
-            data_2.shape = (1, 1, 3)
-
-            out = fluid.layers.fc(input=[data_1, data_2], size=2)
-
-        Then:
-            out.data = [[0.18669507, 0.1893476]]
-            out.shape = (1, 2)
-
-    Args:
-        name_scope(str): The name of this class.
-        size(int): The number of output units in this layer.
-        num_flatten_dims (int): The fc layer can accept an input tensor with more than
-            two dimensions. If this happens, the multidimensional tensor will first be flattened
-            into a 2-dimensional matrix. The parameter `num_flatten_dims` determines how the input
-            tensor is flattened: the first `num_flatten_dims` (inclusive, index starts from 1)
-            dimensions will be flatten to form the first dimension of the final matrix (height of
-            the matrix), and the rest `rank(X) - num_flatten_dims` dimensions are flattened to
-            form the second dimension of the final matrix (width of the matrix). For example, suppose
-            `X` is a 5-dimensional tensor with a shape [2, 3, 4, 5, 6], and `num_flatten_dims` = 3.
-            Then, the flattened matrix will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. Default: 1
-        param_attr (ParamAttr|list of ParamAttr|None): The parameter attribute for learnable
-            parameters/weights of this layer.
-        bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias
-            of this layer. If it is set to False, no bias will be added to the output units.
-            If it is set to None, the bias is initialized zero. Default: None.
-        act (str|None): Activation to be applied to the output of this layer.
-        is_test(bool): A flag indicating whether execution is in test phase. Default: False
-        dtype(str): Dtype used for weight
-
-    Raises:
-        ValueError: If rank of the input tensor is less than 2.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import FC
-          import numpy as np
-
-          data = np.random.uniform( -1, 1, [30, 10, 32] ).astype('float32')
-          with fluid.dygraph.guard():
-              fc = FC( "fc", 64, num_flatten_dims=2)
-              data = to_variable( data )
-              conv = fc( data )
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 size,
-                 num_flatten_dims=1,
-                 epsilon=1e-30,
-                 param_attr=None,
-                 bias_attr=None,
-                 act=None,
-                 is_test=False,
-                 dtype="float32"):
-        super(FC, self).__init__(name_scope, dtype)
-
-        self._size = size
-        self._num_flatten_dims = num_flatten_dims
-        self._epsilon = epsilon
-        self._dtype = dtype
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._act = act
-        self.__g = list()
-        self.__v = list()
-
-    @property
-    def _v(self, i=0):
-        return self.__v[i]
-
-    @property
-    def _g(self, i=0):
-        return self.__g[i]
-
-    @_v.setter
-    def _v(self, value, i=0):
-        assert isinstance(value, Parameter)
-        self.__v[i] = value
-
-    @_g.setter
-    def _g(self, value, i=0):
-        assert isinstance(value, Parameter)
-        self.__g[i] = value
-
-    def _build_once(self, input):
-        i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
-            input_shape = inp.shape
-
-            param_shape = [
-                reduce(lambda a, b: a * b, input_shape[self._num_flatten_dims:],
-                       1)
-            ] + [self._size]
-            self.__v.append(
-                self.add_parameter(
-                    "_v%d" % i,
-                    self.create_parameter(
-                        attr=param,
-                        shape=param_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
-
-            magnitude_shape = param_shape[1:]
-            magnitude_value = np.linalg.norm(self.__v[i].numpy(), ord=2, axis=0)
-
-            self.__g.append(
-                self.add_parameter(
-                    "_g%d" % i,
-                    self.create_parameter(
-                        attr=fluid.ParamAttr(
-                            initializer=fluid.initializer.NumpyArrayInitializer(
-                                magnitude_value)),
-                        shape=magnitude_shape,
-                        dtype=self._dtype,
-                        is_bias=False)))
-            i += 1
-
-        size = list([self._size])
-        self._b = self.create_parameter(
-            attr=self._bias_attr, shape=size, dtype=self._dtype, is_bias=True)
-
-    def forward(self, input):
-        mul_results = list()
-        i = 0
-        for inp, param in self._helper.iter_inputs_and_params(input,
-                                                              self._param_attr):
-            v_norm = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            v_normalized = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="norm",
-                inputs={"X": self.__v[i]},
-                outputs={"Out": v_normalized,
-                         "Norm": v_norm},
-                attrs={"axis": 0,
-                       "epsilon": self._epsilon})
-            weight = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="elementwise_mul",
-                inputs={"X": [v_normalized],
-                        "Y": [self.__g[i]]},
-                outputs={"Out": [weight]},
-                attrs={"axis": 1})
-            tmp = self._helper.create_variable_for_type_inference(self._dtype)
-            self._helper.append_op(
-                type="mul",
-                inputs={"X": inp,
-                        "Y": weight},
-                outputs={"Out": tmp},
-                attrs={
-                    "x_num_col_dims": self._num_flatten_dims,
-                    "y_num_col_dims": 1
-                })
-            i += 1
-            mul_results.append(tmp)
-
-        if len(mul_results) == 1:
-            pre_bias = mul_results[0]
-        else:
-            pre_bias = self._helper.create_variable_for_type_inference(
-                self._dtype)
-            self._helper.append_op(
-                type="sum",
-                inputs={"X": mul_results},
-                outputs={"Out": pre_bias},
-                attrs={"use_mkldnn": False})
-
-        if self._b:
-            pre_activation = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type="elementwise_add",
-                inputs={"X": [pre_bias],
-                        "Y": [self._b]},
-                outputs={"Out": [pre_activation]},
-                attrs={"axis": self._num_flatten_dims})
-        else:
-            pre_activation = pre_bias
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(pre_activation, act=self._act)
-
-
-class Conv2D(dg.Layer):
-    """
-    The convolution2D layer calculates the output based on the input, filter
-    and strides, paddings, dilations, groups parameters. Input and
-    Output are in NCHW format, where N is batch size, C is the number of
-    channels, H is the height of the feature, and W is the width of the feature.
-    Filter is in MCHW format, where M is the number of output image channels,
-    C is the number of input image channels, H is the height of the filter,
-    and W is the width of the filter. If the groups is greater than 1,
-    C will equal the number of input image channels divided by the groups.
-    Please refer to UFLDL's `convolution
-    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`
-    for more detials.
-    If bias attribution and activation type are provided, bias is added to the
-    output of the convolution, and the corresponding activation function is
-    applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma ((Vg) \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`V`: Filter direction value, a tensor with MCHW format.
-    * :math:`g`: Filter magnitude value, a tensor with M format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{out}, C_{in}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-            H_{out}&= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
-            W_{out}&= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
-
-    Args:
-        name_scope(str) : The name for this class.
-        num_filters(int): The number of filter. It is as same as the output
-            image channel.
-        filter_size (int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square.
-        stride (int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        padding (int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        dilation (int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups (int): The groups number of the Conv2d Layer. According to grouped
-            convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
-            the first half of the filters is only connected to the first half
-            of the input channels, while the second half of the filters is only
-            connected to the second half of the input channels. Default: groups=1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with :math:`Normal(0.0, std)`,
-            and the :math:`std` is :math:`(\\frac{2.0 }{filter\_elem\_num})^{0.5}`. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn (bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-        .. code-block:: python
-
-          from paddle.fluid.dygraph.base import to_variable
-          import paddle.fluid as fluid
-          from paddle.fluid.dygraph import Conv2D
-          import numpy as np
-
-          data = np.random.uniform( -1, 1, [10, 3, 32, 32] ).astype('float32')
-          with fluid.dygraph.guard():
-              conv2d = Conv2D( "conv2d", 2, 3)
-              data = to_variable( data )
-              conv = conv2d( data )
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 filter_size,
-                 stride=1,
-                 padding=0,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 act=None,
-                 epsilon=1e-30,
-                 dtype="float32"):
-        assert param_attr is not False, "param_attr should not be False here."
-        super(Conv2D, self).__init__(name_scope, dtype)
-        self._groups = groups
-        self._stride = utils.convert_to_list(stride, 2, "stride")
-        self._padding = utils.convert_to_list(padding, 2, "padding")
-        self._dilation = utils.convert_to_list(dilation, 2, "dilation")
-        self._act = act
-        if not isinstance(use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-        self._use_cudnn = use_cudnn
-        self._filter_size = filter_size
-        self._num_filters = num_filters
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._epsilon = epsilon
-        self._dtype = dtype
-        # if (self._num_channels == self._groups and
-        #         num_filters % self._num_channels == 0 and not self._use_cudnn):
-        #     self._l_type = 'depthwise_conv2d'
-        # else:
-        # TODO(jiabin): recover the usage of depthwise_conv2d when it's
-        #  kernel fixed https://github.com/PaddlePaddle/Paddle/issues/17275
-        self._l_type = "conv2d"
-
-    def _build_once(self, input):
-        self._num_channels = input.shape[1]
-        if self._groups is None:
-            num_filter_channels = self._num_channels
-        else:
-            if self._num_channels % self._groups != 0:
-                raise ValueError("num_channels must be divisible by groups.")
-            num_filter_channels = self._num_channels // self._groups
-        filter_size = utils.convert_to_list(self._filter_size, 2, "filter_size")
-        filter_shape = [self._num_filters, int(num_filter_channels)
-                        ] + filter_size
-
-        def _get_default_param_initializer():
-            filter_elem_num = filter_size[0] * filter_size[
-                1] * self._num_channels
-            std = (2.0 / filter_elem_num)**0.5
-            return Normal(0.0, std, 0)
-
-        # weight_v
-        self._filter_param_v = self.create_parameter(
-            attr=self._param_attr,
-            shape=filter_shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer())
-
-        # weight_g
-        norm_value = _norm(
-            self._filter_param_v.numpy(), dim=0)  # CAUTION: hard-code
-        self._filter_param_g = self.create_parameter(
-            attr=fluid.ParamAttr(
-                initializer=fluid.initializer.NumpyArrayInitializer(
-                    norm_value)),
-            shape=norm_value.shape,
-            dtype=self._dtype,
-            default_initializer=_get_default_param_initializer())
-
-        if self._use_cudnn:
-            self.create_variable(
-                name="kCUDNNFwdAlgoCache",
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-            self.create_variable(
-                name="kCUDNNBwdDataAlgoCache",
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-            self.create_variable(
-                name="kCUDNNBwdFilterAlgoCache",
-                persistable=True,
-                type=core.VarDesc.VarType.RAW)
-
-        self._bias_param = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        matrix = self._helper.create_variable_for_type_inference(self._dtype)
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        new_shape = [
-            self._filter_param_v.shape[0],
-            reduce(lambda x, y: x * y, self._filter_param_v.shape[1:], 1),
-        ]
-
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": self._filter_param_v},
-            attrs={"shape": new_shape},
-            outputs={"Out": matrix,
-                     "XShape": tmp})
-
-        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
-        m_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="norm",
-            inputs={"X": matrix},
-            outputs={"Out": m_normalized,
-                     "Norm": m_norm},
-            attrs={"axis": 1,
-                   "epsilon": self._epsilon})
-
-        v_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": m_normalized},
-            attrs={"shape": self._filter_param_v.shape},
-            outputs={"Out": v_normalized,
-                     "XShape": tmp2})
-
-        filter_param = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="elementwise_mul",
-            inputs={"X": [v_normalized],
-                    "Y": [self._filter_param_g]},
-            outputs={"Out": [filter_param]},
-            attrs={"axis": 0},  # CAUTION: hard-code
-        )
-
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=self._dtype)
-
-        self._helper.append_op(
-            type=self._l_type,
-            inputs={"Input": input,
-                    "Filter": filter_param},
-            outputs={"Output": pre_bias},
-            attrs={
-                "strides": self._stride,
-                "paddings": self._padding,
-                "dilations": self._dilation,
-                "groups": self._groups if self._groups else 1,
-                "use_cudnn": self._use_cudnn,
-                "use_mkldnn": False,
-            })
-
-        if self._bias_param is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type="elementwise_add",
-                inputs={"X": [pre_bias],
-                        "Y": [self._bias_param]},
-                outputs={"Out": [pre_act]},
-                attrs={"axis": 1})
-        else:
-            pre_act = pre_bias
-
-        # Currently, we don't support inplace in dygraph mode
-        return self._helper.append_activation(pre_act, act=self._act)
-
-
-class Conv2DTranspose(dg.Layer):
-    """
-    **Convlution2D transpose layer**
-
-    The convolution2D transpose layer calculates the output based on the input,
-    filter, and dilations, strides, paddings. Input(Input) and output(Output)
-    are in NCHW format. Where N is batch size, C is the number of channels,
-    H is the height of the feature, and W is the width of the feature.
-    Parameters(dilations, strides, paddings) are two elements. These two elements
-    represent height and width, respectively. The details of convolution transpose
-    layer, please refer to the following explanation and references
-    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
-    If bias attribution and activation type are provided, bias is added to
-    the output of the convolution, and the corresponding activation function
-    is applied to the final result.
-
-    For each input :math:`X`, the equation is:
-
-    .. math::
-
-        Out = \sigma ((Vg) \\ast X + b)
-
-    Where:
-
-    * :math:`X`: Input value, a tensor with NCHW format.
-    * :math:`V`: Filter value, a tensor with MCHW format.
-    * :math:`g`: Filter value, a tensor with M format.
-    * :math:`\\ast`: Convolution operation.
-    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
-    * :math:`\\sigma`: Activation function.
-    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different.
-
-    Example:
-
-        - Input:
-
-          Input shape: :math:`(N, C_{in}, H_{in}, W_{in})`
-
-          Filter shape: :math:`(C_{in}, C_{out}, H_f, W_f)`
-
-        - Output:
-
-          Output shape: :math:`(N, C_{out}, H_{out}, W_{out})`
-
-        Where
-
-        .. math::
-
-           H^\prime_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
-           W^\prime_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1 \\\\
-           H_{out} &\in [ H^\prime_{out}, H^\prime_{out} + strides[0] ) \\\\
-           W_{out} &\in [ W^\prime_{out}, W^\prime_{out} + strides[1] )
-
-    Args:
-        name_scope(str): The name of this class.
-        num_filters(int): The number of the filter. It is as same as the output
-            image channel.
-        output_size(int|tuple|None): The output image size. If output size is a
-            tuple, it must contain two integers, (image_H, image_W). None if use
-            filter_size, padding, and stride to calculate output_size.
-            if output_size and filter_size are specified at the same time, They
-            should follow the formula above. Default: None.
-        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
-            it must contain two integers, (filter_size_H, filter_size_W).
-            Otherwise, the filter will be a square. None if use output size to
-            calculate filter_size. Default: None.
-        padding(int|tuple): The padding size. If padding is a tuple, it must
-            contain two integers, (padding_H, padding_W). Otherwise, the
-            padding_H = padding_W = padding. Default: padding = 0.
-        stride(int|tuple): The stride size. If stride is a tuple, it must
-            contain two integers, (stride_H, stride_W). Otherwise, the
-            stride_H = stride_W = stride. Default: stride = 1.
-        dilation(int|tuple): The dilation size. If dilation is a tuple, it must
-            contain two integers, (dilation_H, dilation_W). Otherwise, the
-            dilation_H = dilation_W = dilation. Default: dilation = 1.
-        groups(int): The groups number of the Conv2d transpose layer. Inspired by
-            grouped convolution in Alex Krizhevsky's Deep CNN paper, in which
-            when group=2, the first half of the filters is only connected to the
-            first half of the input channels, while the second half of the
-            filters is only connected to the second half of the input channels.
-            Default: groups = 1.
-        param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights
-            of conv2d_transpose. If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as param_attr. If the Initializer of the param_attr
-            is not set, the parameter is initialized with Xavier. Default: None.
-        bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of conv2d_transpose.
-            If it is set to False, no bias will be added to the output units.
-            If it is set to None or one attribute of ParamAttr, conv2d_transpose
-            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
-            is not set, the bias is initialized zero. Default: None.
-        use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
-            library is installed. Default: True.
-        act (str): Activation type, if it is set to None, activation is not appended.
-            Default: None.
-
-    Returns:
-        Variable: The tensor variable storing the convolution transpose result.
-
-    Raises:
-        ValueError: If the shapes of input, filter_size, stride, padding and
-                    groups mismatch.
-
-    Examples:
-       .. code-block:: python
-
-          import paddle.fluid as fluid
-          import numpy
-
-          with fluid.dygraph.guard():
-              data = numpy.random.random((3, 32, 32)).astype('float32')
-              conv2DTranspose = fluid.dygraph.nn.Conv2DTranspose(
-                    'Conv2DTranspose', num_filters=2, filter_size=3)
-              ret = conv2DTranspose(fluid.dygraph.base.to_variable(data))
-
-    """
-
-    def __init__(self,
-                 name_scope,
-                 num_filters,
-                 output_size=None,
-                 filter_size=None,
-                 padding=0,
-                 stride=1,
-                 dilation=1,
-                 groups=None,
-                 param_attr=None,
-                 bias_attr=None,
-                 use_cudnn=True,
-                 epsilon=1e-30,
-                 act=None,
-                 dtype="float32"):
-        super(Conv2DTranspose, self).__init__(name_scope, dtype)
-        assert (param_attr is not False
-                ), "param_attr should not be False in conv2d_transpose."
-        self._param_attr = param_attr
-        self._bias_attr = bias_attr
-        self._groups = groups
-        self._num_filters = num_filters
-        self._use_cudnn = use_cudnn
-        self._padding = padding
-        self._stride = stride
-        self._dilation = dilation
-        self._filter_size = filter_size
-        self._output_size = output_size
-        self._op_type = "conv2d_transpose"
-        self._epsilon = epsilon
-
-    def _build_once(self, input):
-        input_channel = input.shape[1]
-        if (input_channel == self._groups and
-                self._num_filters == input_channel and not self._use_cudnn):
-            self._op_type = "depthwise_conv2d_transpose"
-
-        if not isinstance(input, Variable):
-            raise TypeError("Input of conv2d_transpose must be Variable")
-
-        self._padding = utils.convert_to_list(self._padding, 2, "padding")
-        self._stride = utils.convert_to_list(self._stride, 2, "stride")
-        self._dilation = utils.convert_to_list(self._dilation, 2, "dilation")
-
-        if not isinstance(self._use_cudnn, bool):
-            raise ValueError("use_cudnn should be True or False")
-
-        if self._filter_size is None:
-            if self._output_size is None:
-                raise ValueError(
-                    "output_size must be set when filter_size is None")
-            if isinstance(self._output_size, int):
-                self._output_size = [self._output_size, self._output_size]
-
-            h_in = input.shape[2]
-            w_in = input.shape[3]
-
-            filter_size_h = (self._output_size[0] -
-                             (h_in - 1) * self._stride[0] + 2 * self._padding[0]
-                             - 1) // self._dilation[0] + 1
-            filter_size_w = (self._output_size[1] -
-                             (w_in - 1) * self._stride[1] + 2 * self._padding[1]
-                             - 1) // self._dilation[1] + 1
-            self._filter_size = [filter_size_h, filter_size_w]
-        else:
-            self._filter_size = utils.convert_to_list(
-                self._filter_size, 2, "conv2d_transpose.filter_size")
-
-        if self._output_size is None:
-            self._output_size = []
-        elif isinstance(self._output_size, list) or isinstance(
-                self._output_size, int):
-            self._output_size = utils.convert_to_list(self._output_size, 2,
-                                                      "output_size")
-        else:
-            raise ValueError("output_size should be list or int")
-        self._padding = utils.convert_to_list(self._padding, 2, "padding")
-        self._groups = 1 if self._groups is None else self._groups
-        filter_shape = [
-            input_channel,
-            self._num_filters // self._groups,
-        ] + self._filter_size
-
-        # img filter v (direction)
-        self._img_filter_v = self.create_parameter(
-            dtype=input.dtype, shape=filter_shape, attr=self._param_attr)
-
-        # img filter g (magnitude)
-        img_filter_magnitude = _norm(
-            self._img_filter_v.numpy(), dim=0)  # CAUTION: hard-code
-        self._img_filter_g = self.create_parameter(
-            dtype=input.dtype,
-            shape=img_filter_magnitude.shape,
-            attr=fluid.ParamAttr(
-                initializer=NumpyArrayInitializer(img_filter_magnitude)))
-
-        self._img_bias = self.create_parameter(
-            attr=self._bias_attr,
-            shape=[self._num_filters],
-            dtype=self._dtype,
-            is_bias=True)
-
-    def forward(self, input):
-        matrix = self._helper.create_variable_for_type_inference(self._dtype)
-        tmp = self._helper.create_variable_for_type_inference(self._dtype)
-        new_shape = [
-            self._img_filter_v.shape[0],
-            reduce(lambda x, y: x * y, self._img_filter_v.shape[1:], 1),
-        ]
-
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": self._img_filter_v},
-            attrs={"shape": new_shape},
-            outputs={"Out": matrix,
-                     "XShape": tmp})
-
-        m_norm = self._helper.create_variable_for_type_inference(self._dtype)
-        m_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="norm",
-            inputs={"X": matrix},
-            outputs={"Out": m_normalized,
-                     "Norm": m_norm},
-            attrs={"axis": 1,
-                   "epsilon": self._epsilon})
-
-        v_normalized = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        tmp2 = self._helper.create_variable_for_type_inference(self._dtype)
-        self._helper.append_op(
-            type="reshape2",
-            inputs={"X": m_normalized},
-            attrs={"shape": self._img_filter_v.shape},
-            outputs={"Out": v_normalized,
-                     "XShape": tmp2})
-
-        img_filter = self._helper.create_variable_for_type_inference(
-            self._dtype)
-        self._helper.append_op(
-            type="elementwise_mul",
-            inputs={"X": [v_normalized],
-                    "Y": [self._img_filter_g]},
-            outputs={"Out": [img_filter]},
-            attrs={"axis": 0},  # CAUTION: hard-code
-        )
-
-        pre_bias = self._helper.create_variable_for_type_inference(
-            dtype=input.dtype)
-        self._helper.append_op(
-            type=self._op_type,
-            inputs={"Input": [input],
-                    "Filter": [img_filter]},
-            outputs={"Output": pre_bias},
-            attrs={
-                "output_size": self._output_size,
-                "strides": self._stride,
-                "paddings": self._padding,
-                "dilations": self._dilation,
-                "groups": self._groups,
-                "use_cudnn": self._use_cudnn,
-            })
-
-        if self._img_bias is not None:
-            pre_act = self._helper.create_variable_for_type_inference(
-                dtype=self._dtype)
-            self._helper.append_op(
-                type="elementwise_add",
-                inputs={"X": [pre_bias],
-                        "Y": [self._img_bias]},
-                outputs={"Out": [pre_act]},
-                attrs={"axis": 1})
-        else:
-            pre_act = pre_bias
-
-        out = self._helper.append_activation(pre_act)
-        return out